X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Fklnds%2Fviblnd%2Fviblnd.c;h=8f93d530622a303a16cf36b7869b4d0b16cdf470;hb=dc3fe5a1019831fcfcbb177356a241ff6c33158a;hp=50e1149e48708f1155e4392026df7ef8f759fdc5;hpb=0ffa249e8f3811a9f1c0c3803bdc8e6fb8435f43;p=fs%2Flustre-release.git diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c index 50e1149..8f93d53 100644 --- a/lnet/klnds/viblnd/viblnd.c +++ b/lnet/klnds/viblnd/viblnd.c @@ -1,330 +1,513 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton - * Author: Frank Zago + * GPL HEADER START * - * This file is part of Lustre, http://www.lustre.org. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/viblnd/viblnd.c + * + * Author: Eric Barton + * Author: Frank Zago */ -#include "vibnal.h" - -nal_t kibnal_api; -ptl_handle_ni_t kibnal_ni; -kib_tunables_t kibnal_tunables; +#include "viblnd.h" -kib_data_t kibnal_data = { - .kib_service_id = IBNAL_SERVICE_NUMBER, +lnd_t the_kiblnd = { + .lnd_type = VIBLND, + .lnd_startup = kibnal_startup, + .lnd_shutdown = kibnal_shutdown, + .lnd_ctl = kibnal_ctl, + .lnd_send = kibnal_send, + .lnd_recv = kibnal_recv, + .lnd_eager_recv = kibnal_eager_recv, }; -#ifdef CONFIG_SYSCTL -#define IBNAL_SYSCTL 202 +kib_data_t kibnal_data; -#define IBNAL_SYSCTL_TIMEOUT 1 +void vibnal_assert_wire_constants (void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux robert 2.6.11-1.27_FC3 #1 Tue May 17 20:27:37 EDT 2005 i686 athlon i386 G + * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */ + + + /* Constants... */ + CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91); + CLASSERT (IBNAL_MSG_VERSION == 0x11); + CLASSERT (IBNAL_MSG_CONNREQ == 0xc0); + CLASSERT (IBNAL_MSG_CONNACK == 0xc1); + CLASSERT (IBNAL_MSG_NOOP == 0xd0); + CLASSERT (IBNAL_MSG_IMMEDIATE == 0xd1); + CLASSERT (IBNAL_MSG_PUT_REQ == 0xd2); + CLASSERT (IBNAL_MSG_PUT_NAK == 0xd3); + CLASSERT (IBNAL_MSG_PUT_ACK == 0xd4); + CLASSERT (IBNAL_MSG_PUT_DONE == 0xd5); + CLASSERT (IBNAL_MSG_GET_REQ == 0xd6); + CLASSERT (IBNAL_MSG_GET_DONE == 0xd7); + + /* Checks for struct kib_connparams_t */ + CLASSERT ((int)sizeof(kib_connparams_t) == 12); + CLASSERT ((int)offsetof(kib_connparams_t, ibcp_queue_depth) == 0); + CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_queue_depth) == 4); + CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_msg_size) == 4); + CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_msg_size) == 4); + CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_frags) == 8); + CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_frags) == 4); + + /* Checks for struct kib_immediate_msg_t */ + CLASSERT ((int)sizeof(kib_immediate_msg_t) == 72); + CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_hdr) == 0); + CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72); + CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85); + CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1); + CLASSERT (IBNAL_USE_FMR == 1); + + /* Checks for struct kib_rdma_desc_t */ + CLASSERT ((int)sizeof(kib_rdma_desc_t) == 16); + CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_addr) == 0); + CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_addr) == 8); + CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nob) == 8); + CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nob) == 4); + CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 12); + CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4); + + /* Checks for struct kib_putreq_msg_t */ + CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80); + CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_hdr) == 0); + CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_hdr) == 72); + CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_cookie) == 72); + CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8); + + /* Checks for struct kib_putack_msg_t */ + CLASSERT ((int)sizeof(kib_putack_msg_t) == 32); + CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0); + CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8); + CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8); + CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8); + CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16); + CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 16); + + /* Checks for struct kib_get_msg_t */ + CLASSERT ((int)sizeof(kib_get_msg_t) == 96); + CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0); + CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72); + CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72); + CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8); + CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80); + CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 16); + + /* Checks for struct kib_completion_msg_t */ + CLASSERT ((int)sizeof(kib_completion_msg_t) == 12); + CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_cookie) == 0); + CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_cookie) == 8); + CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_status) == 8); + CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4); + + /* Checks for struct kib_msg_t */ + CLASSERT ((int)sizeof(kib_msg_t) == 152); + CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4); + CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_version) == 2); + CLASSERT ((int)offsetof(kib_msg_t, ibm_type) == 6); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_type) == 1); + CLASSERT ((int)offsetof(kib_msg_t, ibm_credits) == 7); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_credits) == 1); + CLASSERT ((int)offsetof(kib_msg_t, ibm_nob) == 8); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_nob) == 4); + CLASSERT ((int)offsetof(kib_msg_t, ibm_cksum) == 12); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_cksum) == 4); + CLASSERT ((int)offsetof(kib_msg_t, ibm_srcnid) == 16); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcnid) == 8); + CLASSERT ((int)offsetof(kib_msg_t, ibm_srcstamp) == 24); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcstamp) == 8); + CLASSERT ((int)offsetof(kib_msg_t, ibm_dstnid) == 32); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dstnid) == 8); + CLASSERT ((int)offsetof(kib_msg_t, ibm_dststamp) == 40); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dststamp) == 8); + CLASSERT ((int)offsetof(kib_msg_t, ibm_seq) == 48); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_seq) == 8); + CLASSERT ((int)offsetof(kib_msg_t, ibm_u.connparams) == 56); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.connparams) == 12); + CLASSERT ((int)offsetof(kib_msg_t, ibm_u.immediate) == 56); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.immediate) == 72); + CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80); + CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 32); + CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 96); + CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12); +} -static ctl_table kibnal_ctl_table[] = { - {IBNAL_SYSCTL_TIMEOUT, "timeout", - &kibnal_tunables.kib_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - { 0 } -}; +__u32 +kibnal_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; -static ctl_table kibnal_top_ctl_table[] = { - {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table}, - { 0 } -}; -#endif + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 1 : sum; +} -#ifdef unused void -print_service(IB_SERVICE_RECORD *service, char *tag, int rc) +kibnal_init_msg(kib_msg_t *msg, int type, int body_nob) { - char name[32]; - - if (service == NULL) - { - CWARN("tag : %s\n" - "status : %d (NULL)\n", tag, rc); - return; - } - strncpy (name, service->ServiceName, sizeof(name)-1); - name[sizeof(name)-1] = 0; - - CWARN("tag : %s\n" - "status : %d\n" - "service id: "LPX64"\n" - "name : %s\n" - "NID : "LPX64"\n", tag, rc, - service->RID.ServiceID, name, - *kibnal_service_nid_field(service)); + msg->ibm_type = type; + msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; } -#endif -/* - * method is SUBN_ADM_SET, SUBN_ADM_GET, SUBN_ADM_DELETE. Tables not supported. - * nid is the nid to advertize/query/unadvertize - */ -static void fill_sa_request(struct sa_request *request, int method, ptl_nid_t nid) +void +kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, + lnet_nid_t dstnid, __u64 dststamp, __u64 seq) { - gsi_dtgrm_t *dtgrm = request->dtgrm_req; - sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad; - ib_service_record_v2_t *sr = (ib_service_record_v2_t *) mad->payload; - - memset(mad, 0, MAD_BLOCK_SIZE); - - request->mad = mad; - - dtgrm->rlid = kibnal_data.kib_port_attr.port_sma_address_info.sm_lid; - dtgrm->sl = kibnal_data.kib_port_attr.port_sma_address_info.service_level; - - mad->hdr.base_ver = MAD_IB_BASE_VERSION; - mad->hdr.class = MAD_CLASS_SUBN_ADM; - mad->hdr.class_ver = 2; - mad->hdr.m.ms.method = method; - mad->hdr.attrib_id = SA_SERVICE_RECORD; /* something(?) will swap that field */ - - /* Note: the transaction ID is set by the Voltaire stack if it is 0. */ - - /* TODO: change the 40 to sizeof(something) */ - mad->payload_len = cpu_to_be32(0x40 /*header size */ + - sizeof (ib_service_record_v2_t)); - - - mad->component_mask = cpu_to_be64( - (1ull << 0) | /* service_id */ - (1ull << 2) | /* service_pkey */ - (1ull << 6) | /* service_name */ - (1ull << 7) | /* service_data8[0] */ - (1ull << 8) | /* service_data8[1] */ - (1ull << 9) | /* service_data8[2] */ - (1ull << 10) | /* service_data8[3] */ - (1ull << 11) | /* service_data8[4] */ - (1ull << 12) | /* service_data8[5] */ - (1ull << 13) | /* service_data8[6] */ - (1ull << 14) /* service_data8[7] */ - ); - - sr->service_id = cpu_to_be64(kibnal_data.kib_service_id); - sr->service_pkey = cpu_to_be16(kibnal_data.kib_port_pkey); - - /* Set the service name and the data (bytes 0 to 7) in data8 */ - kibnal_set_service_keys(sr, nid); - - if (method == SUBN_ADM_SET) { - mad->component_mask |= cpu_to_be64( - (1ull << 1) | /* service_gid */ - (1ull << 4) /* service_lease */ - ); - - sr->service_gid = kibnal_data.kib_port_gid; - gid_swap(&sr->service_gid); - sr->service_lease = cpu_to_be32(0xffffffff); - } - - CDEBUG(D_NET, "SA request %02x for service id "LPX64" %s:"LPX64"\n", - mad->hdr.m.ms.method, - sr->service_id, - sr->service_name, - *kibnal_service_nid_field(sr)); + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. */ + msg->ibm_magic = IBNAL_MSG_MAGIC; + msg->ibm_version = version; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = kibnal_data.kib_ni->ni_nid; + msg->ibm_srcstamp = kibnal_data.kib_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; + msg->ibm_seq = seq; + + if (*kibnal_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); + } } -/* Do an advertizement operation: - * SUBN_ADM_GET = 0x01 (i.e. query), - * SUBN_ADM_SET = 0x02 (i.e. advertize), - * SUBN_ADM_DELETE = 0x15 (i.e. un-advertize). - * If callback is NULL, the function is synchronous (and context is ignored). - */ -int kibnal_advertize_op(ptl_nid_t nid, int op, sa_request_cb_t callback, void *context) +int +kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) { - struct sa_request *request; - int ret; + const int hdr_size = offsetof(kib_msg_t, ibm_u); + __u32 msg_cksum; + __u32 msg_version; + int flip; + int msg_nob; +#if !IBNAL_USE_FMR + int i; + int n; +#endif + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + /* Future protocol version compatibility support! + * If the viblnd-specific protocol changes, or when LNET unifies + * protocols over all LNDs, the initial connection will negotiate a + * protocol version. If I find this, I avoid any console errors. If + * my is doing connection establishment, the reject will tell the peer + * which version I'm running. */ + + if (msg->ibm_magic == IBNAL_MSG_MAGIC) { + flip = 0; + } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) { + flip = 1; + } else { + if (msg->ibm_magic == LNET_PROTO_MAGIC || + msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) + return -EPROTO; - LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + /* Completely out to lunch */ + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; + } - CDEBUG(D_NET, "kibnal_advertize_op: nid="LPX64", op=%d\n", nid, op); + msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; + if (expected_version == 0) { + if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && + msg_version != IBNAL_MSG_VERSION) + return -EPROTO; + } else if (msg_version != expected_version) { + CERROR("Bad version: %x(%x expected)\n", + msg_version, expected_version); + return -EPROTO; + } - request = alloc_sa_request(); - if (request == NULL) { - CERROR("Cannot allocate a SA request"); - return -ENOMEM; + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; } - - fill_sa_request(request, op, nid); - if (callback) { - request->callback = callback; - request->context = context; - } else { - init_completion(&request->signal); + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; } - ret = vibnal_start_sa_request(request); - if (ret) { - CERROR("vibnal_send_sa failed: %d\n", ret); - free_sa_request(request); - } else { - if (callback) { - /* Return. The callback will have to free the SA request. */ - ret = 0; - } else { - wait_for_completion(&request->signal); + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped */ + msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kibnal_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + msg->ibm_cksum = msg_cksum; - ret = request->status; + if (flip) { + /* leave magic unflipped as a clue to peer endianness */ + msg->ibm_version = msg_version; + CLASSERT (sizeof(msg->ibm_type) == 1); + CLASSERT (sizeof(msg->ibm_credits) == 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + __swab64s(&msg->ibm_seq); + } - if (ret != 0) { - CERROR ("Error %d in advertising operation %d for NID "LPX64"\n", - ret, op, kibnal_data.kib_nid); - } - - free_sa_request(request); - } + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); + return -EPROTO; } - return ret; -} + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; -static int -kibnal_set_mynid(ptl_nid_t nid) -{ - struct timeval tv; - lib_ni_t *ni = &kibnal_lib.libnal_ni; - int rc; - vv_return_t retval; + case IBNAL_MSG_NOOP: + break; - CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); + case IBNAL_MSG_IMMEDIATE: + if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) { + CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])); + return -EPROTO; + } + break; - do_gettimeofday(&tv); + case IBNAL_MSG_PUT_REQ: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) { + CERROR("Short PUT_REQ: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.putreq))); + return -EPROTO; + } + break; - down (&kibnal_data.kib_nid_mutex); + case IBNAL_MSG_PUT_ACK: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) { + CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.putack))); + return -EPROTO; + } +#if IBNAL_USE_FMR + if (flip) { + __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); + } +#else + if (flip) { + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag); + } - if (nid == kibnal_data.kib_nid) { - /* no change of NID */ - up (&kibnal_data.kib_nid_mutex); - return (0); - } + n = msg->ibm_u.putack.ibpam_rd.rd_nfrag; + if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { + CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", + n, IBNAL_MAX_RDMA_FRAGS); + return -EPROTO; + } - CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", - kibnal_data.kib_nid, nid); + if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) { + CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])); + return -EPROTO; + } - /* Unsubscribes the current NID */ - if (kibnal_data.kib_nid != PTL_NID_ANY) { + if (flip) { + for (i = 0; i < n; i++) { + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi); + } + } +#endif + break; - rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_DELETE, NULL, NULL); + case IBNAL_MSG_GET_REQ: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) { + CERROR("Short GET_REQ: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.get))); + return -EPROTO; + } +#if IBNAL_USE_FMR + if (flip) { + __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); + } +#else + if (flip) { + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag); + } + + n = msg->ibm_u.get.ibgm_rd.rd_nfrag; + if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { + CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", + n, IBNAL_MAX_RDMA_FRAGS); + return -EPROTO; + } - if (rc) { - CERROR("Error %d unadvertising NID "LPX64"\n", - rc, kibnal_data.kib_nid); + if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) { + CERROR("Short GET_REQ: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])); + return -EPROTO; } + + if (flip) + for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) { + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi); + } +#endif + break; + + case IBNAL_MSG_PUT_NAK: + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) { + CERROR("Short RDMA completion: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.completion))); + return -EPROTO; + } + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBNAL_MSG_CONNREQ: + case IBNAL_MSG_CONNACK: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) { + CERROR("Short connreq/ack: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.connparams))); + return -EPROTO; + } + if (flip) { + __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + __swab32s(&msg->ibm_u.connparams.ibcp_max_frags); + } + break; } - - kibnal_data.kib_nid = ni->ni_pid.nid = nid; - kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + return 0; +} - /* Destroys the current endpoint, if any. */ - if (kibnal_data.kib_cep) { - retval = cm_cancel(kibnal_data.kib_cep); - if (retval) - CERROR ("Error %d stopping listener\n", retval); - - retval = cm_destroy_cep(kibnal_data.kib_cep); - if (retval) - CERROR ("Error %d destroying CEP\n", retval); - - kibnal_data.kib_cep = NULL; - } - - /* Delete all existing peers and their connections after new - * NID/incarnation set to ensure no old connections in our brave - * new world. */ - kibnal_del_peer (PTL_NID_ANY, 0); - - if (kibnal_data.kib_nid == PTL_NID_ANY) { - /* No new NID to install. The driver is shuting down. */ - up (&kibnal_data.kib_nid_mutex); - return (0); +int +kibnal_start_listener (lnet_ni_t *ni) +{ + static cm_listen_data_t info; + + cm_return_t cmrc; + + LASSERT (kibnal_data.kib_listen_handle == NULL); + + kibnal_data.kib_listen_handle = + cm_create_cep(cm_cep_transp_rc); + if (kibnal_data.kib_listen_handle == NULL) { + CERROR ("Can't create listen CEP\n"); + return -ENOMEM; } - /* remove any previous advert (crashed node etc) */ - kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_DELETE, NULL, NULL); + CDEBUG(D_NET, "Created CEP %p for listening\n", + kibnal_data.kib_listen_handle); - kibnal_data.kib_cep = cm_create_cep(cm_cep_transp_rc); - if (kibnal_data.kib_cep == NULL) { - CERROR ("Can't create CEP\n"); - rc = -ENOMEM; - } else { - cm_return_t cmret; - cm_listen_data_t info; + memset(&info, 0, sizeof(info)); + info.listen_addr.end_pt.sid = + (__u64)(*kibnal_tunables.kib_service_number); - CDEBUG(D_NET, "Created CEP %p for listening\n", kibnal_data.kib_cep); + cmrc = cm_listen(kibnal_data.kib_listen_handle, &info, + kibnal_listen_callback, NULL); + if (cmrc == cm_stat_success) + return 0; - memset(&info, 0, sizeof(info)); - info.listen_addr.end_pt.sid = kibnal_data.kib_service_id; + CERROR ("cm_listen error: %d\n", cmrc); - cmret = cm_listen(kibnal_data.kib_cep, &info, - kibnal_listen_callback, NULL); - if (cmret) { - CERROR ("cm_listen error: %d\n", cmret); - rc = -EINVAL; - } else { - rc = 0; - } - } - - if (rc == 0) { - rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_SET, NULL, NULL); - if (rc == 0) { -#ifdef IBNAL_CHECK_ADVERT - kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_GET, NULL, NULL); -#endif - up (&kibnal_data.kib_nid_mutex); - return (0); - } - - retval = cm_cancel (kibnal_data.kib_cep); - if (retval) - CERROR("cm_cancel failed: %d\n", retval); + cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); + LASSERT (cmrc == cm_stat_success); + + kibnal_data.kib_listen_handle = NULL; + return -EINVAL; +} + +void +kibnal_stop_listener(lnet_ni_t *ni) +{ + cm_return_t cmrc; - retval = cm_destroy_cep (kibnal_data.kib_cep); - if (retval) - CERROR("cm_destroy_cep failed: %d\n", retval); + LASSERT (kibnal_data.kib_listen_handle != NULL); - /* remove any peers that sprung up while I failed to - * advertise myself */ - kibnal_del_peer (PTL_NID_ANY, 0); - } + cmrc = cm_cancel(kibnal_data.kib_listen_handle); + if (cmrc != cm_stat_success) + CERROR ("Error %d stopping listener\n", cmrc); - kibnal_data.kib_nid = PTL_NID_ANY; - up (&kibnal_data.kib_nid_mutex); - return (rc); + cfs_pause(cfs_time_seconds(1)/10); /* ensure no more callbacks */ + + cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); + if (cmrc != vv_return_ok) + CERROR ("Error %d destroying CEP\n", cmrc); + + kibnal_data.kib_listen_handle = NULL; } -kib_peer_t * -kibnal_create_peer (ptl_nid_t nid) +int +kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid) { - kib_peer_t *peer; + kib_peer_t *peer; + unsigned long flags; + int rc; - LASSERT (nid != PTL_NID_ANY); + LASSERT (nid != LNET_NID_ANY); - PORTAL_ALLOC(peer, sizeof (*peer)); + LIBCFS_ALLOC(peer, sizeof (*peer)); if (peer == NULL) { - CERROR("Canot allocate perr\n"); - return (NULL); + CERROR("Cannot allocate peer\n"); + return -ENOMEM; } memset(peer, 0, sizeof(*peer)); /* zero flags etc */ @@ -336,38 +519,62 @@ kibnal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD (&peer->ibp_conns); INIT_LIST_HEAD (&peer->ibp_tx_queue); - peer->ibp_reconnect_time = jiffies; - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_error = 0; + peer->ibp_last_alive = cfs_time_current(); + peer->ibp_reconnect_interval = 0; /* OK to connect at any time */ - atomic_inc (&kibnal_data.kib_npeers); - return (peer); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + if (atomic_read(&kibnal_data.kib_npeers) >= + *kibnal_tunables.kib_concurrent_peers) { + rc = -EOVERFLOW; /* !! but at least it distinguishes */ + } else if (kibnal_data.kib_listen_handle == NULL) { + rc = -ESHUTDOWN; /* shutdown has started */ + } else { + rc = 0; + /* npeers only grows with the global lock held */ + atomic_inc(&kibnal_data.kib_npeers); + } + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + if (rc != 0) { + CERROR("Can't create peer: %s\n", + (rc == -ESHUTDOWN) ? "shutting down" : + "too many peers"); + LIBCFS_FREE(peer, sizeof(*peer)); + } else { + *peerp = peer; + } + + return rc; } void kibnal_destroy_peer (kib_peer_t *peer) { - LASSERT (atomic_read (&peer->ibp_refcount) == 0); LASSERT (peer->ibp_persistence == 0); LASSERT (!kibnal_peer_active(peer)); LASSERT (peer->ibp_connecting == 0); + LASSERT (peer->ibp_accepting == 0); LASSERT (list_empty (&peer->ibp_conns)); LASSERT (list_empty (&peer->ibp_tx_queue)); - - PORTAL_FREE (peer, sizeof (*peer)); + + LIBCFS_FREE (peer, sizeof (*peer)); /* NB a peer's connections keep a reference on their peer until * they are destroyed, so we can be assured that _all_ state to do * with this peer has been cleaned up when its refcount drops to * zero. */ - atomic_dec (&kibnal_data.kib_npeers); + atomic_dec(&kibnal_data.kib_npeers); } -/* the caller is responsible for accounting for the additional reference - * that this creates */ kib_peer_t * -kibnal_find_peer_locked (ptl_nid_t nid) +kibnal_find_peer_locked (lnet_nid_t nid) { + /* the caller is responsible for accounting the additional reference + * that this creates */ struct list_head *peer_list = kibnal_nid2peerlist (nid); struct list_head *tmp; kib_peer_t *peer; @@ -378,32 +585,20 @@ kibnal_find_peer_locked (ptl_nid_t nid) LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ peer->ibp_connecting != 0 || /* creating conns */ + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); /* active conn */ if (peer->ibp_nid != nid) continue; - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read (&peer->ibp_refcount)); + CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", + peer, libcfs_nid2str(nid), + atomic_read (&peer->ibp_refcount)); return (peer); } return (NULL); } -kib_peer_t * -kibnal_get_peer (ptl_nid_t nid) -{ - kib_peer_t *peer; - - read_lock (&kibnal_data.kib_global_lock); - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) /* +1 ref for caller? */ - kib_peer_addref(peer); - read_unlock (&kibnal_data.kib_global_lock); - - return (peer); -} - void kibnal_unlink_peer_locked (kib_peer_t *peer) { @@ -413,17 +608,19 @@ kibnal_unlink_peer_locked (kib_peer_t *peer) LASSERT (kibnal_peer_active(peer)); list_del_init (&peer->ibp_list); /* lose peerlist's ref */ - kib_peer_decref(peer); + kibnal_peer_decref(peer); } -static int -kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) +int +kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, + int *persistencep) { kib_peer_t *peer; struct list_head *ptmp; int i; + unsigned long flags; - read_lock (&kibnal_data.kib_global_lock); + read_lock_irqsave(&kibnal_data.kib_global_lock, flags); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { @@ -432,42 +629,53 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); if (index-- > 0) continue; *nidp = peer->ibp_nid; + *ipp = peer->ibp_ip; *persistencep = peer->ibp_persistence; - read_unlock (&kibnal_data.kib_global_lock); + read_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); return (0); } } - read_unlock (&kibnal_data.kib_global_lock); + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (-ENOENT); } -static int -kibnal_add_persistent_peer (ptl_nid_t nid) +int +kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip) { - unsigned long flags; kib_peer_t *peer; kib_peer_t *peer2; - - if (nid == PTL_NID_ANY) + unsigned long flags; + int rc; + + CDEBUG(D_NET, "%s at %u.%u.%u.%u\n", + libcfs_nid2str(nid), HIPQUAD(ip)); + + if (nid == LNET_NID_ANY) return (-EINVAL); - peer = kibnal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); + rc = kibnal_create_peer(&peer, nid); + if (rc != 0) + return rc; - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + /* I'm always called with a reference on kibnal_data.kib_ni + * so shutdown can't have started */ + LASSERT (kibnal_data.kib_listen_handle != NULL); peer2 = kibnal_find_peer_locked (nid); if (peer2 != NULL) { - kib_peer_decref (peer); + kibnal_peer_decref (peer); peer = peer2; } else { /* peer table takes existing ref on peer */ @@ -475,51 +683,53 @@ kibnal_add_persistent_peer (ptl_nid_t nid) kibnal_nid2peerlist (nid)); } + peer->ibp_ip = ip; peer->ibp_persistence++; - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (0); } -static void -kibnal_del_peer_locked (kib_peer_t *peer, int single_share) +void +kibnal_del_peer_locked (kib_peer_t *peer) { struct list_head *ctmp; struct list_head *cnxt; kib_conn_t *conn; - if (!single_share) - peer->ibp_persistence = 0; - else if (peer->ibp_persistence > 0) - peer->ibp_persistence--; + peer->ibp_persistence = 0; - if (peer->ibp_persistence != 0) - return; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, kib_conn_t, ibc_list); + if (list_empty(&peer->ibp_conns)) { + kibnal_unlink_peer_locked(peer); + } else { + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); - kibnal_close_conn_locked (conn, 0); + kibnal_close_conn_locked (conn, 0); + } + /* NB peer is no longer persistent; closing its last conn + * unlinked it. */ } - - /* NB peer unlinks itself when last conn is closed */ + /* NB peer now unlinked; might even be freed if the peer table had the + * last ref on it. */ } int -kibnal_del_peer (ptl_nid_t nid, int single_share) +kibnal_del_peer (lnet_nid_t nid) { - unsigned long flags; + CFS_LIST_HEAD (zombies); struct list_head *ptmp; struct list_head *pnxt; kib_peer_t *peer; int lo; int hi; int i; + unsigned long flags; int rc = -ENOENT; - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - if (nid != PTL_NID_ANY) + if (nid != LNET_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; @@ -531,25 +741,31 @@ kibnal_del_peer (ptl_nid_t nid, int single_share) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); - if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) + if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) continue; - kibnal_del_peer_locked (peer, single_share); - rc = 0; /* matched something */ + if (!list_empty(&peer->ibp_tx_queue)) { + LASSERT (list_empty(&peer->ibp_conns)); - if (single_share) - goto out; + list_splice_init(&peer->ibp_tx_queue, &zombies); + } + + kibnal_del_peer_locked (peer); + rc = 0; /* matched something */ } } - out: - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + kibnal_txlist_done(&zombies, -EIO); return (rc); } -static kib_conn_t * +kib_conn_t * kibnal_get_conn_by_idx (int index) { kib_peer_t *peer; @@ -557,8 +773,9 @@ kibnal_get_conn_by_idx (int index) kib_conn_t *conn; struct list_head *ctmp; int i; + unsigned long flags; - read_lock (&kibnal_data.kib_global_lock); + read_lock_irqsave(&kibnal_data.kib_global_lock, flags); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { list_for_each (ptmp, &kibnal_data.kib_peers[i]) { @@ -566,6 +783,7 @@ kibnal_get_conn_by_idx (int index) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence > 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); list_for_each (ctmp, &peer->ibp_conns) { @@ -573,35 +791,204 @@ kibnal_get_conn_by_idx (int index) continue; conn = list_entry (ctmp, kib_conn_t, ibc_list); - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - read_unlock (&kibnal_data.kib_global_lock); + kibnal_conn_addref(conn); + read_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); return (conn); } } } - read_unlock (&kibnal_data.kib_global_lock); + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (NULL); } +void +kibnal_debug_rx (kib_rx_t *rx) +{ + CDEBUG(D_CONSOLE, " %p nob %d msg_type %x " + "cred %d seq "LPD64"\n", + rx, rx->rx_nob, rx->rx_msg->ibm_type, + rx->rx_msg->ibm_credits, rx->rx_msg->ibm_seq); +} + +void +kibnal_debug_tx (kib_tx_t *tx) +{ + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx " + "cookie "LPX64" msg %s%s type %x cred %d seq "LPD64"\n", + tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, + tx->tx_status, tx->tx_deadline, tx->tx_cookie, + tx->tx_lntmsg[0] == NULL ? "-" : "!", + tx->tx_lntmsg[1] == NULL ? "-" : "!", + tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits, + tx->tx_msg->ibm_seq); +} + +void +kibnal_debug_conn (kib_conn_t *conn) +{ + struct list_head *tmp; + int i; + + spin_lock(&conn->ibc_lock); + + CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n", + atomic_read(&conn->ibc_refcount), conn, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + CDEBUG(D_CONSOLE, " txseq "LPD64" rxseq "LPD64" state %d \n", + conn->ibc_txseq, conn->ibc_rxseq, conn->ibc_state); + CDEBUG(D_CONSOLE, " nposted %d cred %d o_cred %d r_cred %d\n", + conn->ibc_nsends_posted, conn->ibc_credits, + conn->ibc_outstanding_credits, conn->ibc_reserved_credits); + CDEBUG(D_CONSOLE, " disc %d comms_err %d\n", + conn->ibc_disconnect, conn->ibc_comms_error); + + CDEBUG(D_CONSOLE, " early_rxs:\n"); + list_for_each(tmp, &conn->ibc_early_rxs) + kibnal_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_nocred) + kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) + kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue:\n"); + list_for_each(tmp, &conn->ibc_tx_queue) + kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " active_txs:\n"); + list_for_each(tmp, &conn->ibc_active_txs) + kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " rxs:\n"); + for (i = 0; i < IBNAL_RX_MSGS; i++) + kibnal_debug_rx(&conn->ibc_rxs[i]); + + spin_unlock(&conn->ibc_lock); +} + +int +kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) +{ + static vv_qp_attr_t attr; + + kib_connvars_t *cv = conn->ibc_connvars; + vv_return_t vvrc; + + /* Only called by connd => static OK */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); + + memset(&attr, 0, sizeof(attr)); + + switch (new_state) { + default: + LBUG(); + + case vv_qp_state_init: { + struct vv_qp_modify_init_st *init = &attr.modify.params.init; + + init->p_key_indx = cv->cv_pkey_index; + init->phy_port_num = cv->cv_port; + init->q_key = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */ + init->access_control = vv_acc_r_mem_read | + vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */ + + attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX | + VV_QP_AT_PHY_PORT_NUM | + VV_QP_AT_ACCESS_CON_F; + break; + } + case vv_qp_state_rtr: { + struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr; + vv_add_vec_t *av = &rtr->remote_add_vec; + + av->dlid = cv->cv_path.dlid; + av->grh_flag = (!IBNAL_LOCAL_SUB); + av->max_static_rate = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate); + av->service_level = cv->cv_path.sl; + av->source_path_bit = IBNAL_SOURCE_PATH_BIT; + av->pmtu = cv->cv_path.mtu; + av->rnr_retry_count = cv->cv_rnr_count; + av->global_dest.traffic_class = cv->cv_path.traffic_class; + av->global_dest.hope_limit = cv->cv_path.hop_limut; + av->global_dest.flow_lable = cv->cv_path.flow_label; + av->global_dest.s_gid_index = cv->cv_sgid_index; + // XXX other av fields zero? + + rtr->destanation_qp = cv->cv_remote_qpn; + rtr->receive_psn = cv->cv_rxpsn; + rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD; + rtr->opt_min_rnr_nak_timer = *kibnal_tunables.kib_rnr_nak_timer; + + + // XXX sdp sets VV_QP_AT_OP_F but no actual optional options + attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC | + VV_QP_AT_DEST_QP | + VV_QP_AT_R_PSN | + VV_QP_AT_MIN_RNR_NAK_T | + VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM | + VV_QP_AT_OP_F; + break; + } + case vv_qp_state_rts: { + struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts; + + rts->send_psn = cv->cv_txpsn; + rts->local_ack_timeout = *kibnal_tunables.kib_local_ack_timeout; + rts->retry_num = *kibnal_tunables.kib_retry_cnt; + rts->rnr_num = *kibnal_tunables.kib_rnr_cnt; + rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD; + + attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN | + VV_QP_AT_L_ACK_T | + VV_QP_AT_RETRY_NUM | + VV_QP_AT_RNR_NUM | + VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM; + break; + } + case vv_qp_state_error: + case vv_qp_state_reset: + attr.modify.vv_qp_attr_mask = 0; + break; + } + + attr.modify.qp_modify_into_state = new_state; + attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE; + + vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL); + if (vvrc != vv_return_ok) { + CERROR("Can't modify qp -> %s state to %d: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + new_state, vvrc); + return -EIO; + } + + return 0; +} + kib_conn_t * -kibnal_create_conn (void) +kibnal_create_conn (cm_cep_handle_t cep) { - kib_conn_t *conn; - int i; - __u64 vaddr = 0; - __u64 vaddr_base; - int page_offset; - int ipage; - vv_qp_attr_t qp_attr; - vv_return_t retval; - int rc; - void *qp_context; - - PORTAL_ALLOC(conn, sizeof (*conn)); + kib_conn_t *conn; + int i; + int page_offset; + int ipage; + vv_return_t vvrc; + int rc; + + static vv_qp_attr_t reqattr; + static vv_qp_attr_t rspattr; + + /* Only the connd creates conns => single threaded */ + LASSERT(!in_interrupt()); + LASSERT(current == kibnal_data.kib_connd); + + LIBCFS_ALLOC(conn, sizeof (*conn)); if (conn == NULL) { CERROR ("Can't allocate connection\n"); return (NULL); @@ -610,14 +997,31 @@ kibnal_create_conn (void) /* zero flags, NULL pointers etc... */ memset (conn, 0, sizeof (*conn)); + conn->ibc_version = IBNAL_MSG_VERSION; /* Use latest version at first */ + + INIT_LIST_HEAD (&conn->ibc_early_rxs); + INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred); INIT_LIST_HEAD (&conn->ibc_tx_queue); + INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd); INIT_LIST_HEAD (&conn->ibc_active_txs); spin_lock_init (&conn->ibc_lock); - + atomic_inc (&kibnal_data.kib_nconns); /* well not really, but I call destroy() on failure, which decrements */ - PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); + conn->ibc_cep = cep; + + LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + if (conn->ibc_connvars == NULL) { + CERROR("Can't allocate in-progress connection state\n"); + goto failed; + } + memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars)); + /* Random seed for QP sequence number */ + get_random_bytes(&conn->ibc_connvars->cv_rxpsn, + sizeof(conn->ibc_connvars->cv_rxpsn)); + + LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); if (conn->ibc_rxs == NULL) { CERROR("Cannot allocate RX buffers\n"); goto failed; @@ -628,39 +1032,27 @@ kibnal_create_conn (void) if (rc != 0) goto failed; - vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; - for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { - struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; - kib_rx_t *rx = &conn->ibc_rxs[i]; + struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; + kib_rx_t *rx = &conn->ibc_rxs[i]; + vv_mem_reg_h_t mem_h; + vv_r_key_t r_key; rx->rx_conn = conn; - rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + + rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - if (kibnal_whole_mem()) { - void *newaddr; - vv_mem_reg_h_t mem_h; - vv_r_key_t r_key; - - /* Voltaire stack already registers the whole - * memory, so use that API. */ - retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca, - rx->rx_msg, - IBNAL_MSG_SIZE, - &mem_h, - &rx->l_key, - &r_key); - if (retval) { - CERROR("vv_get_gen_mr_attrib failed: %d", retval); - /* TODO: free pages? */ - goto failed; - } - } - - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); - + vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, + rx->rx_msg, + IBNAL_MSG_SIZE, + &mem_h, + &rx->rx_lkey, + &r_key); + LASSERT (vvrc == vv_return_ok); + + CDEBUG(D_NET, "Rx[%d] %p->%p[%x]\n", i, rx, + rx->rx_msg, rx->rx_lkey); + page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -671,54 +1063,51 @@ kibnal_create_conn (void) } } - qp_attr = (vv_qp_attr_t) { - .create.qp_type = vv_qp_type_r_conn, - .create.cq_send_h = kibnal_data.kib_cq, - .create.cq_receive_h = kibnal_data.kib_cq, - .create.send_max_outstand_wr = IBNAL_TX_MAX_SG * - IBNAL_MSG_QUEUE_SIZE, - .create.receive_max_outstand_wr = IBNAL_MSG_QUEUE_SIZE, - .create.max_scatgat_per_send_wr = 1, - .create.max_scatgat_per_receive_wr = 1, - .create.signaling_type = vv_selectable_signaling, /* TODO: correct? */ - .create.pd_h = kibnal_data.kib_pd, - .create.recv_solicited_events = vv_signal_all, - }; - retval = vv_qp_create(kibnal_data.kib_hca, &qp_attr, NULL, - &conn->ibc_qp, &conn->ibc_qp_attrs); - if (retval != 0) { - CERROR ("Failed to create queue pair: %d\n", retval); + memset(&reqattr, 0, sizeof(reqattr)); + + reqattr.create.qp_type = vv_qp_type_r_conn; + reqattr.create.cq_send_h = kibnal_data.kib_cq; + reqattr.create.cq_receive_h = kibnal_data.kib_cq; + reqattr.create.send_max_outstand_wr = (1 + IBNAL_MAX_RDMA_FRAGS) * + (*kibnal_tunables.kib_concurrent_sends); + reqattr.create.receive_max_outstand_wr = IBNAL_RX_MSGS; + reqattr.create.max_scatgat_per_send_wr = 1; + reqattr.create.max_scatgat_per_receive_wr = 1; + reqattr.create.signaling_type = vv_selectable_signaling; + reqattr.create.pd_h = kibnal_data.kib_pd; + reqattr.create.recv_solicited_events = vv_selectable_signaling; // vv_signal_all; + + vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL, + &conn->ibc_qp, &rspattr); + if (vvrc != vv_return_ok) { + CERROR ("Failed to create queue pair: %d\n", vvrc); goto failed; } /* Mark QP created */ conn->ibc_state = IBNAL_CONN_INIT_QP; - - qp_attr = (vv_qp_attr_t) { - .modify.qp_modify_into_state = vv_qp_state_init, - .modify.vv_qp_attr_mask = VV_QP_AT_STATE | VV_QP_AT_PHY_PORT_NUM | VV_QP_AT_P_KEY_IX | VV_QP_AT_ACCESS_CON_F, - .modify.qp_type = vv_qp_type_r_conn, - - .modify.params.init.p_key_indx = 0, - .modify.params.init.phy_port_num = kibnal_data.kib_port, - .modify.params.init.access_control = vv_acc_r_mem_write | vv_acc_r_mem_read, - }; - retval = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &qp_attr, &conn->ibc_qp_attrs); - if (retval != 0) { - CERROR ("Failed to modify queue pair: %d\n", retval); + conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num; + + if (rspattr.create_return.receive_max_outstand_wr < + IBNAL_RX_MSGS || + rspattr.create_return.send_max_outstand_wr < + (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends)) { + CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n", + IBNAL_RX_MSGS, + (1 + IBNAL_MAX_RDMA_FRAGS) * + (*kibnal_tunables.kib_concurrent_sends), + rspattr.create_return.receive_max_outstand_wr, + rspattr.create_return.send_max_outstand_wr); goto failed; } - retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs); - if (retval) { - CERROR ("Failed to query queue pair: %d\n", retval); - goto failed; - } + /* Mark init complete */ + conn->ibc_state = IBNAL_CONN_INIT; /* 1 ref for caller */ atomic_set (&conn->ibc_refcount, 1); return (conn); - + failed: kibnal_destroy_conn (conn); return (NULL); @@ -727,91 +1116,67 @@ kibnal_create_conn (void) void kibnal_destroy_conn (kib_conn_t *conn) { - vv_return_t retval; - + vv_return_t vvrc; + + /* Only the connd does this (i.e. single threaded) */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); + CDEBUG (D_NET, "connection %p\n", conn); LASSERT (atomic_read (&conn->ibc_refcount) == 0); + LASSERT (list_empty(&conn->ibc_early_rxs)); LASSERT (list_empty(&conn->ibc_tx_queue)); + LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); LASSERT (list_empty(&conn->ibc_active_txs)); LASSERT (conn->ibc_nsends_posted == 0); - LASSERT (conn->ibc_connreq == NULL); switch (conn->ibc_state) { + default: + /* conn must be completely disengaged from the network */ + LBUG(); + case IBNAL_CONN_DISCONNECTED: - /* called after connection sequence initiated */ + /* connvars should have been freed already */ + LASSERT (conn->ibc_connvars == NULL); + /* fall through */ + + case IBNAL_CONN_INIT: + vvrc = cm_destroy_cep(conn->ibc_cep); + LASSERT (vvrc == vv_return_ok); /* fall through */ case IBNAL_CONN_INIT_QP: - /* _destroy includes an implicit Reset of the QP which - * discards posted work */ - retval = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp); - if (retval) - CERROR("Can't destroy QP: %d\n", retval); + kibnal_set_qp_state(conn, vv_qp_state_reset); + vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp); + if (vvrc != vv_return_ok) + CERROR("Can't destroy QP: %d\n", vvrc); /* fall through */ - + case IBNAL_CONN_INIT_NOTHING: break; - - default: - LASSERT (0); - } - - if (conn->ibc_cep != NULL) { - retval = cm_destroy_cep(conn->ibc_cep); - if (retval) - CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, - retval); } - if (conn->ibc_rx_pages != NULL) + if (conn->ibc_rx_pages != NULL) kibnal_free_pages(conn->ibc_rx_pages); - + if (conn->ibc_rxs != NULL) - PORTAL_FREE(conn->ibc_rxs, + LIBCFS_FREE(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof(kib_rx_t)); + if (conn->ibc_connvars != NULL) + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + if (conn->ibc_peer != NULL) - kib_peer_decref(conn->ibc_peer); + kibnal_peer_decref(conn->ibc_peer); - PORTAL_FREE(conn, sizeof (*conn)); + LIBCFS_FREE(conn, sizeof (*conn)); atomic_dec(&kibnal_data.kib_nconns); - - if (atomic_read (&kibnal_data.kib_nconns) == 0 && - kibnal_data.kib_shutdown) { - /* I just nuked the last connection on shutdown; wake up - * everyone so they can exit. */ - wake_up_all(&kibnal_data.kib_sched_waitq); - wake_up_all(&kibnal_data.kib_connd_waitq); - } } -void -kibnal_put_conn (kib_conn_t *conn) -{ - unsigned long flags; - - CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - - LASSERT (atomic_read (&conn->ibc_refcount) > 0); - if (!atomic_dec_and_test (&conn->ibc_refcount)) - return; - - /* must disconnect before dropping the final ref */ - LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - - list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); -} - -static int +int kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) { kib_conn_t *conn; @@ -843,9 +1208,10 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) if (conn->ibc_incarnation == incarnation) continue; - CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", - peer->ibp_nid, conn->ibc_incarnation, incarnation); - + CDEBUG(D_NET, "Closing stale conn -> %s incarnation:"LPX64"("LPX64")\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_incarnation, incarnation); + count++; kibnal_close_conn_locked (conn, -ESTALE); } @@ -853,21 +1219,21 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) return (count); } -static int -kibnal_close_matching_conns (ptl_nid_t nid) +int +kibnal_close_matching_conns (lnet_nid_t nid) { - unsigned long flags; kib_peer_t *peer; struct list_head *ptmp; struct list_head *pnxt; int lo; int hi; int i; + unsigned long flags; int count = 0; - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - if (nid != PTL_NID_ANY) + if (nid != LNET_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; @@ -880,118 +1246,109 @@ kibnal_close_matching_conns (ptl_nid_t nid) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); - if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) + if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) continue; count += kibnal_close_peer_conns_locked (peer, 0); } } - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); /* wildcards always succeed */ - if (nid == PTL_NID_ANY) + if (nid == LNET_NID_ANY) return (0); - + return (count == 0 ? -ENOENT : 0); } -static int -kibnal_cmd(struct portals_cfg *pcfg, void * private) +int +kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) { - int rc = -EINVAL; - ENTRY; - - LASSERT (pcfg != NULL); - - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; - int share_count = 0; - - rc = kibnal_get_peer_info(pcfg->pcfg_count, - &nid, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = 0; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_count = 0; - pcfg->pcfg_wait = share_count; + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; + + LASSERT (ni == kibnal_data.kib_ni); + + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + __u32 ip = 0; + int share_count = 0; + + rc = kibnal_get_peer_info(data->ioc_count, + &nid, &ip, &share_count); + data->ioc_nid = nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = *kibnal_tunables.kib_service_number; /* port */ break; } - case NAL_CMD_ADD_PEER: { - rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); + case IOC_LIBCFS_ADD_PEER: { + rc = kibnal_add_persistent_peer (data->ioc_nid, + data->ioc_u32[0]); /* IP */ break; } - case NAL_CMD_DEL_PEER: { - rc = kibnal_del_peer (pcfg->pcfg_nid, - /* flags == single_share */ - pcfg->pcfg_flags != 0); + case IOC_LIBCFS_DEL_PEER: { + rc = kibnal_del_peer (data->ioc_nid); break; } - case NAL_CMD_GET_CONN: { - kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); + case IOC_LIBCFS_GET_CONN: { + kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count); if (conn == NULL) rc = -ENOENT; else { + // kibnal_debug_conn(conn); rc = 0; - pcfg->pcfg_nid = conn->ibc_peer->ibp_nid; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_flags = 0; - kibnal_put_conn (conn); + data->ioc_nid = conn->ibc_peer->ibp_nid; + kibnal_conn_decref(conn); } break; } - case NAL_CMD_CLOSE_CONNECTION: { - rc = kibnal_close_matching_conns (pcfg->pcfg_nid); + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kibnal_close_matching_conns (data->ioc_nid); break; } - case NAL_CMD_REGISTER_MYNID: { - if (pcfg->pcfg_nid == PTL_NID_ANY) + case IOC_LIBCFS_REGISTER_MYNID: { + if (ni->ni_nid == data->ioc_nid) { + rc = 0; + } else { + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); rc = -EINVAL; - else - rc = kibnal_set_mynid (pcfg->pcfg_nid); + } break; } } - RETURN(rc); + return rc; } void kibnal_free_pages (kib_pages_t *p) { - int npages = p->ibp_npages; - vv_return_t retval; - int i; - - if (p->ibp_mapped) { - retval = vv_mem_region_destroy(kibnal_data.kib_hca, p->ibp_handle); - if (retval != 0) - CERROR ("Deregister error: %d\n", retval); - } - + int npages = p->ibp_npages; + int i; + for (i = 0; i < npages; i++) if (p->ibp_pages[i] != NULL) __free_page(p->ibp_pages[i]); - - PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); + + LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); } int kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) { kib_pages_t *p; - vv_phy_list_t phys_pages; - vv_phy_buf_t *phys_buf; int i; - vv_return_t retval; - PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { CERROR ("Can't allocate buffer %d\n", npages); return (-ENOMEM); @@ -999,7 +1356,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); p->ibp_npages = npages; - + for (i = 0; i < npages; i++) { p->ibp_pages[i] = alloc_page (GFP_KERNEL); if (p->ibp_pages[i] == NULL) { @@ -1009,125 +1366,179 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) } } - if (kibnal_whole_mem()) - goto out; + *pp = p; + return (0); +} - PORTAL_ALLOC(phys_buf, npages * sizeof(vv_phy_buf_t)); - if (phys_buf == NULL) { - CERROR ("Can't allocate phys_buf for %d pages\n", npages); - /* XXX free ibp_pages? */ - kibnal_free_pages(p); - return (-ENOMEM); +int +kibnal_alloc_tx_descs (void) +{ + int i; + + LIBCFS_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); + if (kibnal_data.kib_tx_descs == NULL) + return -ENOMEM; + + memset(kibnal_data.kib_tx_descs, 0, + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); + + for (i = 0; i < IBNAL_TX_MSGS(); i++) { + kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; + +#if IBNAL_USE_FMR + LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + return -ENOMEM; +#else + LIBCFS_ALLOC(tx->tx_wrq, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tx->tx_gl, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_gl)); + if (tx->tx_gl == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBNAL_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + return -ENOMEM; +#endif } - phys_pages.number_of_buff = npages; - phys_pages.phy_list = phys_buf; + return 0; +} - /* if we were using the _contig_ registration variant we would have - * an array of PhysAddr/Length pairs, but the discontiguous variant - * just takes the PhysAddr */ - for (i = 0; i < npages; i++) { - phys_buf[i].start = kibnal_page2phys(p->ibp_pages[i]); - phys_buf[i].size = PAGE_SIZE; - } - - retval = vv_phy_mem_region_register(kibnal_data.kib_hca, - &phys_pages, - 0, /* requested vaddr */ - npages * PAGE_SIZE, - 0, /* offset */ - kibnal_data.kib_pd, - vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind, /* TODO: translated as-is, but seems incorrect or too much */ - &p->ibp_handle, &p->ibp_vaddr, - &p->ibp_lkey, &p->ibp_rkey); - - PORTAL_FREE(phys_buf, npages * sizeof(vv_phy_buf_t)); - - if (retval) { - CERROR ("Error %d mapping %d pages\n", retval, npages); - kibnal_free_pages(p); - return (-ENOMEM); +void +kibnal_free_tx_descs (void) +{ + int i; + + if (kibnal_data.kib_tx_descs == NULL) + return; + + for (i = 0; i < IBNAL_TX_MSGS(); i++) { + kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; + +#if IBNAL_USE_FMR + if (tx->tx_pages != NULL) + LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV * + sizeof(*tx->tx_pages)); +#else + if (tx->tx_wrq != NULL) + LIBCFS_FREE(tx->tx_wrq, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + + if (tx->tx_gl != NULL) + LIBCFS_FREE(tx->tx_gl, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_gl)); + + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBNAL_MAX_RDMA_FRAGS])); +#endif } - CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" " - "lkey %x rkey %x\n", npages, p->ibp_handle, - p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey); - - p->ibp_mapped = 1; -out: - *pp = p; - return (0); + LIBCFS_FREE(kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); +} + +#if IBNAL_USE_FMR +void +kibnal_free_fmrs (int n) +{ + int i; + vv_return_t vvrc; + kib_tx_t *tx; + + for (i = 0; i < n; i++) { + tx = &kibnal_data.kib_tx_descs[i]; + + vvrc = vv_free_fmr(kibnal_data.kib_hca, + tx->tx_md.md_fmrhandle); + if (vvrc != vv_return_ok) + CWARN("vv_free_fmr[%d]: %d\n", i, vvrc); + } } +#endif -static int +int kibnal_setup_tx_descs (void) { - int ipage = 0; - int page_offset = 0; - __u64 vaddr; - __u64 vaddr_base; - struct page *page; - kib_tx_t *tx; - int i; - int rc; + int ipage = 0; + int page_offset = 0; + struct page *page; + kib_tx_t *tx; + vv_mem_reg_h_t mem_h; + vv_r_key_t rkey; + vv_return_t vvrc; + int i; + int rc; +#if IBNAL_USE_FMR + vv_fmr_t fmr_props; +#endif /* pre-mapped messages are not bigger than 1 page */ - LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); + CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); /* No fancy arithmetic when we do the buffer calculations */ - LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); + CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, - 0); + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, + IBNAL_TX_MSG_PAGES(), 0); if (rc != 0) return (rc); - /* ignored for the whole_mem case */ - vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - - for (i = 0; i < IBNAL_TX_MSGS; i++) { + for (i = 0; i < IBNAL_TX_MSGS(); i++) { page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; tx = &kibnal_data.kib_tx_descs[i]; - memset (tx, 0, sizeof(*tx)); /* zero flags etc */ - - tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + - page_offset); - - if (kibnal_whole_mem()) { - void *newaddr; - vv_mem_reg_h_t mem_h; - vv_return_t retval; - - /* Voltaire stack already registers the whole - * memory, so use that API. */ - retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca, - tx->tx_msg, - IBNAL_MSG_SIZE, - &mem_h, - &tx->l_key, - &tx->r_key); - if (retval) { - CERROR("vv_get_gen_mr_attrib failed: %d", retval); - /* TODO: free pages? */ - /* TODO: return. */ - } +#if IBNAL_USE_FMR + memset(&fmr_props, 0, sizeof(fmr_props)); + fmr_props.pd_hndl = kibnal_data.kib_pd; + fmr_props.acl = (vv_acc_r_mem_write | + vv_acc_l_mem_write); + fmr_props.max_pages = LNET_MAX_IOV; + fmr_props.log2_page_sz = PAGE_SHIFT; + fmr_props.max_outstanding_maps = *kibnal_tunables.kib_fmr_remaps; + + vvrc = vv_alloc_fmr(kibnal_data.kib_hca, + &fmr_props, + &tx->tx_md.md_fmrhandle); + if (vvrc != vv_return_ok) { + CERROR("Can't allocate fmr %d: %d\n", i, vvrc); + kibnal_free_fmrs(i); + kibnal_free_pages (kibnal_data.kib_tx_pages); + return -ENOMEM; } - tx->tx_isnblk = (i >= IBNAL_NTX); - tx->tx_mapped = KIB_TX_UNMAPPED; + tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps; + tx->tx_md.md_active = 0; +#endif + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); - CDEBUG(D_NET, "Tx[%d] %p->%p\n", i, tx, tx->tx_msg); + vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, + tx->tx_msg, + IBNAL_MSG_SIZE, + &mem_h, + &tx->tx_lkey, + &rkey); + LASSERT (vvrc == vv_return_ok); - if (tx->tx_isnblk) - list_add (&tx->tx_list, - &kibnal_data.kib_idle_nblk_txs); - else - list_add (&tx->tx_list, - &kibnal_data.kib_idle_txs); + CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx, + tx->tx_msg, tx->tx_lkey); - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -1135,123 +1546,92 @@ kibnal_setup_tx_descs (void) if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= IBNAL_TX_MSG_PAGES); + LASSERT (ipage <= IBNAL_TX_MSG_PAGES()); } } - + return (0); } -static void -kibnal_api_shutdown (nal_t *nal) +void +kibnal_shutdown (lnet_ni_t *ni) { - int i; - int rc; - vv_return_t retval; + int i; + vv_return_t vvrc; - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } + LASSERT (ni == kibnal_data.kib_ni); + LASSERT (ni->ni_data == &kibnal_data); CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - - LASSERT(nal == &kibnal_api); + atomic_read (&libcfs_kmemory)); switch (kibnal_data.kib_init) { case IBNAL_INIT_ALL: - /* stop calls to nal_cmd */ - libcfs_nal_cmd_unregister(VIBNAL); - /* No new peers */ + /* stop accepting connections and prevent new peers */ + kibnal_stop_listener(ni); - /* resetting my NID to unadvertises me, removes my - * listener and nukes all current peers */ - kibnal_set_mynid (PTL_NID_ANY); + /* nuke all existing peers */ + kibnal_del_peer(LNET_NID_ANY); - /* Wait for all peer state to clean up (crazy) */ + /* Wait for all peer state to clean up */ i = 2; - while (atomic_read (&kibnal_data.kib_npeers) != 0) { + while (atomic_read(&kibnal_data.kib_npeers) != 0) { i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers to disconnect (can take a few seconds)\n", - atomic_read (&kibnal_data.kib_npeers)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */ + "waiting for %d peers to disconnect\n", + atomic_read(&kibnal_data.kib_npeers)); + cfs_pause(cfs_time_seconds(1)); } /* fall through */ case IBNAL_INIT_CQ: - retval = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq); - if (retval) - CERROR ("Destroy CQ error: %d\n", retval); + vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq); + if (vvrc != vv_return_ok) + CERROR ("Destroy CQ error: %d\n", vvrc); /* fall through */ case IBNAL_INIT_TXD: kibnal_free_pages (kibnal_data.kib_tx_pages); +#if IBNAL_USE_FMR + kibnal_free_fmrs(IBNAL_TX_MSGS()); +#endif /* fall through */ -#if IBNAL_FMR - case IBNAL_INIT_FMR: - rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); - if (rc != 0) - CERROR ("Destroy FMR pool error: %d\n", rc); - /* fall through */ -#endif case IBNAL_INIT_PD: -#if IBNAL_WHOLE_MEM==0 - retval = vv_pd_deallocate(kibnal_data.kib_hca, kibnal_data.kib_pd); - if (retval != 0) - CERROR ("Destroy PD error: %d\n", retval); +#if 0 + /* Only deallocate a PD if we actually allocated one */ + vvrc = vv_pd_deallocate(kibnal_data.kib_hca, + kibnal_data.kib_pd); + if (vvrc != vv_return_ok) + CERROR ("Destroy PD error: %d\n", vvrc); #endif /* fall through */ - case IBNAL_INIT_GSI: - retval = gsi_deregister_class(kibnal_data.gsi_handle); - if (retval != 0) - CERROR ("GSI deregister failed: %d\n", retval); - /* fall through */ - - case IBNAL_INIT_GSI_POOL: - gsi_dtgrm_pool_destroy(kibnal_data.gsi_pool_handle); - /* fall through */ - - case IBNAL_INIT_PORT: - /* XXX ??? */ - /* fall through */ - case IBNAL_INIT_ASYNC: - retval = vv_dell_async_event_cb (kibnal_data.kib_hca, - kibnal_ca_async_callback); - if (retval) - CERROR("deregister asynchronous call back error: %d\n", retval); - - /* fall through */ + vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca, + kibnal_async_callback); + if (vvrc != vv_return_ok) + CERROR("vv_dell_async_event_cb error: %d\n", vvrc); - case IBNAL_INIT_HCA: - retval = vv_hca_close(kibnal_data.kib_hca); - if (retval != 0) - CERROR ("Close HCA error: %d\n", retval); /* fall through */ - case IBNAL_INIT_LIB: - lib_fini(&kibnal_lib); + case IBNAL_INIT_HCA: + vvrc = vv_hca_close(kibnal_data.kib_hca); + if (vvrc != vv_return_ok) + CERROR ("Close HCA error: %d\n", vvrc); /* fall through */ case IBNAL_INIT_DATA: - /* Module refcount only gets to zero when all peers - * have been closed so all lists must be empty */ - LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0); LASSERT (kibnal_data.kib_peers != NULL); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { LASSERT (list_empty (&kibnal_data.kib_peers[i])); } LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); - LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); - LASSERT (list_empty (&kibnal_data.kib_sched_txq)); + LASSERT (list_empty (&kibnal_data.kib_connd_zombies)); LASSERT (list_empty (&kibnal_data.kib_connd_conns)); + LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs)); LASSERT (list_empty (&kibnal_data.kib_connd_peers)); /* flag threads to terminate; wake and wait for them to die */ @@ -1265,78 +1645,130 @@ kibnal_api_shutdown (nal_t *nal) CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "Waiting for %d threads to terminate\n", atomic_read (&kibnal_data.kib_nthreads)); - set_current_state (TASK_INTERRUPTIBLE); - schedule_timeout (HZ); + cfs_pause(cfs_time_seconds(1)); } /* fall through */ - + case IBNAL_INIT_NOTHING: break; } - if (kibnal_data.kib_tx_descs != NULL) - PORTAL_FREE (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + kibnal_free_tx_descs(); if (kibnal_data.kib_peers != NULL) - PORTAL_FREE (kibnal_data.kib_peers, - sizeof (struct list_head) * + LIBCFS_FREE (kibnal_data.kib_peers, + sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); + atomic_read (&libcfs_kmemory)); kibnal_data.kib_init = IBNAL_INIT_NOTHING; + PORTAL_MODULE_UNUSE; } -#define roundup_power(val, power) \ - ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) ) - -/* this isn't very portable or sturdy in the face of funny mem/bus configs */ -static __u64 max_phys_mem(void) -{ - struct sysinfo si; - __u64 ret; - - si_meminfo(&si); - ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit; - return roundup_power(ret, 128 * 1024 * 1024); -} -#undef roundup_power - -static int -kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) +int +kibnal_startup (lnet_ni_t *ni) { - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); - int rc; - int i; + char scratch[32]; + char ipif_name[32]; + char *hca_name; + __u32 ip; + __u32 netmask; + int up; + int nob; + int devno; + struct timeval tv; + int rc; + int i; vv_request_event_record_t req_er; - vv_return_t retval; + vv_return_t vvrc; - LASSERT (nal == &kibnal_api); + LASSERT (ni->ni_lnd == &the_kiblnd); - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); + /* Only 1 instance supported */ + if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) { + CERROR ("Only 1 instance supported\n"); + return -EPERM; + } + + if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) { + CERROR ("Can't set credits(%d) > ntx(%d)\n", + *kibnal_tunables.kib_credits, + *kibnal_tunables.kib_ntx); + return -EINVAL; } - LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); + ni->ni_maxtxcredits = *kibnal_tunables.kib_credits; + ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits; - init_MUTEX (&kibnal_data.kib_nid_mutex); - kibnal_data.kib_nid = PTL_NID_ANY; + CLASSERT (LNET_MAX_INTERFACES > 1); + + if (ni->ni_interfaces[0] != NULL) { + /* Use the HCA specified in 'networks=' */ + + if (ni->ni_interfaces[1] != NULL) { + CERROR("Multiple interfaces not supported\n"); + return -EPERM; + } + + /* Parse */ + hca_name = ni->ni_interfaces[0]; + nob = strlen(*kibnal_tunables.kib_hca_basename); + + if (strncmp(hca_name, *kibnal_tunables.kib_hca_basename, nob) || + sscanf(hca_name + nob, "%d%n", &devno, &nob) < 1) { + CERROR("Unrecognised HCA %s\n", hca_name); + return -EINVAL; + } + + } else { + /* Use 0 */ + devno = 0; + + hca_name = scratch; + snprintf(hca_name, sizeof(scratch), "%s%d", + *kibnal_tunables.kib_hca_basename, devno); + if (strlen(hca_name) == sizeof(scratch) - 1) { + CERROR("HCA name %s truncated\n", hca_name); + return -EINVAL; + } + } + + /* Find IP address from */ + snprintf(ipif_name, sizeof(ipif_name), "%s%d", + *kibnal_tunables.kib_ipif_basename, devno); + if (strlen(ipif_name) == sizeof(ipif_name) - 1) { + CERROR("IPoIB interface name %s truncated\n", ipif_name); + return -EINVAL; + } + + rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask); + if (rc != 0) { + CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc); + return -ENETDOWN; + } + + if (!up) { + CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name); + return -ENETDOWN; + } + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip); + + PORTAL_MODULE_USE; + memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ + + kibnal_data.kib_ni = ni; + ni->ni_data = &kibnal_data; + + do_gettimeofday(&tv); + kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; rwlock_init(&kibnal_data.kib_global_lock); kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (kibnal_data.kib_peers, + LIBCFS_ALLOC (kibnal_data.kib_peers, sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); if (kibnal_data.kib_peers == NULL) { goto failed; @@ -1346,26 +1778,20 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, spin_lock_init (&kibnal_data.kib_connd_lock); INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); + INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs); INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); + INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies); init_waitqueue_head (&kibnal_data.kib_connd_waitq); spin_lock_init (&kibnal_data.kib_sched_lock); - INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); - INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); init_waitqueue_head (&kibnal_data.kib_sched_waitq); spin_lock_init (&kibnal_data.kib_tx_lock); INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); - init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); - INIT_LIST_HEAD (&kibnal_data.gsi_pending); - init_MUTEX (&kibnal_data.gsi_mutex); - - PORTAL_ALLOC (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); - if (kibnal_data.kib_tx_descs == NULL) { - CERROR ("Can't allocate tx descs\n"); + rc = kibnal_alloc_tx_descs(); + if (rc != 0) { + CERROR("Can't allocate tx descs\n"); goto failed; } @@ -1373,22 +1799,8 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_DATA; /*****************************************************/ - process_id.pid = requested_pid; - process_id.nid = kibnal_data.kib_nid; - - rc = lib_init(&kibnal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - goto failed; - } - - /* lib interface initialised */ - kibnal_data.kib_init = IBNAL_INIT_LIB; - /*****************************************************/ - for (i = 0; i < IBNAL_N_SCHED; i++) { - rc = kibnal_thread_start (kibnal_scheduler, (void *)i); + rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i)); if (rc != 0) { CERROR("Can't spawn vibnal scheduler[%d]: %d\n", i, rc); @@ -1402,10 +1814,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, goto failed; } - /* TODO: apparently only one adapter is supported */ - retval = vv_hca_open("ANY_HCA", NULL, &kibnal_data.kib_hca); - if (retval) { - CERROR ("Can't open CA: %d\n", retval); + vvrc = vv_hca_open(hca_name, NULL, &kibnal_data.kib_hca); + if (vvrc != vv_return_ok) { + CERROR ("Can't open HCA %s: %d\n", hca_name, vvrc); goto failed; } @@ -1414,12 +1825,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, /* register to get HCA's asynchronous events. */ req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK; - retval = vv_set_async_event_cb (kibnal_data.kib_hca, - req_er, - kibnal_ca_async_callback); - - if (retval) { - CERROR ("Can't open CA: %d\n", retval); + vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er, + kibnal_async_callback); + if (vvrc != vv_return_ok) { + CERROR ("Can't set HCA %s callback: %d\n", hca_name, vvrc); goto failed; } @@ -1427,10 +1836,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, /*****************************************************/ - retval = vv_hca_query(kibnal_data.kib_hca, - &kibnal_data.kib_hca_attrs); - if (retval) { - CERROR ("Can't size port attrs: %d\n", retval); + vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs); + if (vvrc != vv_return_ok) { + CERROR ("Can't size port attrs for %s: %d\n", hca_name, vvrc); goto failed; } @@ -1442,9 +1850,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, u_int32_t tbl_count; vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr; - retval = vv_port_query(kibnal_data.kib_hca, port_num, pattr); - if (retval) { - CERROR("vv_port_query failed for port %d: %d\n", port_num, retval); + vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr); + if (vvrc != vv_return_ok) { + CERROR("vv_port_query failed for %s port %d: %d\n", + hca_name, port_num, vvrc); continue; } @@ -1462,155 +1871,110 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, CDEBUG(D_NET, "port[%d] Active\n", port_num); /* Found a suitable port. Get its GUID and PKEY. */ - kibnal_data.kib_port = port_num; - tbl_count = 1; - retval = vv_get_port_gid_tbl(kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_gid); - if (retval) { - CERROR("vv_get_port_gid_tbl failed for port %d: %d\n", port_num, retval); + vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, + port_num, &tbl_count, + &kibnal_data.kib_port_gid); + if (vvrc != vv_return_ok) { + CERROR("vv_get_port_gid_tbl failed " + "for %s port %d: %d\n", + hca_name, port_num, vvrc); continue; } tbl_count = 1; - retval = vv_get_port_partition_tbl (kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_pkey); - if (retval) { - CERROR("vv_get_port_partition_tbl failed for port %d: %d\n", port_num, retval); + vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, + port_num, &tbl_count, + &kibnal_data.kib_port_pkey); + if (vvrc != vv_return_ok) { + CERROR("vv_get_port_partition_tbl failed " + "for %s port %d: %d\n", + hca_name, port_num, vvrc); continue; } + kibnal_data.kib_port = port_num; + break; case vv_state_linkActDefer: /* TODO: correct? */ case vv_state_linkNoChange: - CERROR("Unexpected port[%d] state %d\n", - i, pattr->port_state); + CERROR("Unexpected %s port[%d] state %d\n", + hca_name, i, pattr->port_state); continue; } break; } if (kibnal_data.kib_port == -1) { - CERROR ("Can't find an active port\n"); + CERROR ("Can't find an active port on %s\n", hca_name); goto failed; } - CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n", - kibnal_data.kib_port, kibnal_data.kib_port_gid.scope.g.subnet, kibnal_data.kib_port_gid.scope.g.eui64); - CDEBUG(D_NET, "got guid "LPX64"\n", cpu_to_le64(kibnal_data.kib_port_gid.scope.g.eui64)); - - /* Active port found */ - kibnal_data.kib_init = IBNAL_INIT_PORT; - /*****************************************************/ + CDEBUG(D_NET, "Using %s port %d - GID="LPX64":"LPX64"\n", + hca_name, kibnal_data.kib_port, + kibnal_data.kib_port_gid.scope.g.subnet, + kibnal_data.kib_port_gid.scope.g.eui64); - /* Prepare things to be able to send/receive MADS */ - retval = gsi_dtgrm_pool_create(IBNAL_CONCURRENT_PEERS, &kibnal_data.gsi_pool_handle); - if (retval) { - CERROR("Could not create GSI pool: %d\n", retval); - goto failed; - } - kibnal_data.kib_init = IBNAL_INIT_GSI_POOL; - - retval = gsi_register_class(MAD_CLASS_SUBN_ADM, /* TODO: correct? */ - 2, /* version */ - "ANY_HCA", -#ifdef GSI_PASS_PORT_NUM - kibnal_data.kib_port, -#endif - 0, 0, - vibnal_mad_sent_cb, vibnal_mad_received_cb, - NULL, &kibnal_data.gsi_handle); - if (retval) { - CERROR("Cannot register GSI class: %d\n", retval); - goto failed; - } - - kibnal_data.kib_init = IBNAL_INIT_GSI; /*****************************************************/ -#if IBNAL_WHOLE_MEM==0 - retval = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd); +#if 1 + /* We use a pre-allocated PD */ + vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd); #else - retval = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd); + vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd); #endif - if (retval) { - CERROR ("Can't create PD: %d\n", retval); + if (vvrc != vv_return_ok) { + CERROR ("Can't init PD: %d\n", vvrc); goto failed; } - + /* flag PD initialised */ kibnal_data.kib_init = IBNAL_INIT_PD; /*****************************************************/ -#if IBNAL_FMR - { - const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; - struct ib_fmr_pool_param params = { - .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, - .access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ), - .pool_size = pool_size, - .dirty_watermark = (pool_size * 3)/4, - .flush_function = NULL, - .flush_arg = NULL, - .cache = 1, - }; - rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms, - &kibnal_data.kib_fmr_pool); - if (rc != 0) { - CERROR ("Can't create FMR pool size %d: %d\n", - pool_size, rc); - goto failed; - } - } - - /* flag FMR pool initialised */ - kibnal_data.kib_init = IBNAL_INIT_FMR; -#endif - - /*****************************************************/ - rc = kibnal_setup_tx_descs(); if (rc != 0) { CERROR ("Can't register tx descs: %d\n", rc); goto failed; } - + /* flag TX descs initialised */ kibnal_data.kib_init = IBNAL_INIT_TXD; /*****************************************************/ + { - uint32_t nentries; - - retval = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, - kibnal_ca_callback, - NULL, /* context */ - &kibnal_data.kib_cq, &nentries); - if (retval) { - CERROR ("Can't create RX CQ: %d\n", retval); + __u32 nentries; + + vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(), + kibnal_cq_callback, + NULL, /* context */ + &kibnal_data.kib_cq, &nentries); + if (vvrc != 0) { + CERROR ("Can't create RX CQ: %d\n", vvrc); goto failed; } /* flag CQ initialised */ kibnal_data.kib_init = IBNAL_INIT_CQ; - if (nentries < IBNAL_CQ_ENTRIES) { - CERROR ("CQ only has %d entries, need %d\n", - nentries, IBNAL_CQ_ENTRIES); + if (nentries < IBNAL_CQ_ENTRIES()) { + CERROR ("CQ only has %d entries, need %d\n", + nentries, IBNAL_CQ_ENTRIES()); goto failed; } - retval = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event); - if (retval != 0) { + vvrc = vv_request_completion_notification(kibnal_data.kib_hca, + kibnal_data.kib_cq, + vv_next_solicit_unsolicit_event); + if (vvrc != 0) { CERROR ("Failed to re-arm completion queue: %d\n", rc); goto failed; } } - - /*****************************************************/ - rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL); + rc = kibnal_start_listener(ni); if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); + CERROR("Can't start listener: %d\n", rc); goto failed; } @@ -1618,27 +1982,19 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_ALL; /*****************************************************/ - printk(KERN_INFO "Lustre: Voltaire IB NAL loaded " - "(initial mem %d)\n", pkmem); - - return (PTL_OK); + return (0); failed: - CDEBUG(D_NET, "kibnal_api_startup failed\n"); - kibnal_api_shutdown (&kibnal_api); - return (PTL_FAIL); + CDEBUG(D_NET, "kibnal_startup failed\n"); + kibnal_shutdown (ni); + return (-ENETDOWN); } void __exit kibnal_module_fini (void) { -#ifdef CONFIG_SYSCTL - if (kibnal_tunables.kib_sysctl != NULL) - unregister_sysctl_table (kibnal_tunables.kib_sysctl); -#endif - PtlNIFini(kibnal_ni); - - ptl_unregister_nal(VIBNAL); + lnet_unregister_lnd(&the_kiblnd); + kibnal_tunables_fini(); } int __init @@ -1646,48 +2002,31 @@ kibnal_module_init (void) { int rc; - if (sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len) { - CERROR("sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len\n"); - return -EINVAL; - } - - /* the following must be sizeof(int) for proc_dointvec() */ - if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) { - CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n"); - return -EINVAL; - } - - kibnal_api.nal_ni_init = kibnal_api_startup; - kibnal_api.nal_ni_fini = kibnal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; + vibnal_assert_wire_constants(); + + CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) + <= cm_REQ_priv_data_len); + CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) + <= cm_REP_priv_data_len); + CLASSERT (sizeof(kib_msg_t) <= IBNAL_MSG_SIZE); +#if !IBNAL_USE_FMR + CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) + <= IBNAL_MSG_SIZE); + CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) + <= IBNAL_MSG_SIZE); +#endif + rc = kibnal_tunables_init(); + if (rc != 0) + return rc; - rc = ptl_register_nal(VIBNAL, &kibnal_api); - if (rc != PTL_OK) { - CERROR("Can't register IBNAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } + lnet_register_lnd(&the_kiblnd); - /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(VIBNAL); - return (-ENODEV); - } - -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - kibnal_tunables.kib_sysctl = - register_sysctl_table (kibnal_top_ctl_table, 0); -#endif - return (0); + return 0; } -MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01"); +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Kernel Voltaire IB LND v1.00"); MODULE_LICENSE("GPL"); module_init(kibnal_module_init); module_exit(kibnal_module_fini); -