/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * * Copyright (C) 2004 Cluster File Systems, Inc. * Author: Eric Barton * Author: Frank Zago * * This file is part of Lustre, http://www.lustre.org. * * Lustre is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * Lustre is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * */ #include "viblnd.h" lnd_t the_kiblnd = { .lnd_type = VIBLND, .lnd_startup = kibnal_startup, .lnd_shutdown = kibnal_shutdown, .lnd_ctl = kibnal_ctl, .lnd_send = kibnal_send, .lnd_recv = kibnal_recv, .lnd_eager_recv = kibnal_eager_recv, }; kib_data_t kibnal_data; void vibnal_assert_wire_constants (void) { /* Wire protocol assertions generated by 'wirecheck' * running on Linux robert 2.6.11-1.27_FC3 #1 Tue May 17 20:27:37 EDT 2005 i686 athlon i386 G * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */ /* Constants... */ CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91); CLASSERT (IBNAL_MSG_VERSION == 0x11); CLASSERT (IBNAL_MSG_CONNREQ == 0xc0); CLASSERT (IBNAL_MSG_CONNACK == 0xc1); CLASSERT (IBNAL_MSG_NOOP == 0xd0); CLASSERT (IBNAL_MSG_IMMEDIATE == 0xd1); CLASSERT (IBNAL_MSG_PUT_REQ == 0xd2); CLASSERT (IBNAL_MSG_PUT_NAK == 0xd3); CLASSERT (IBNAL_MSG_PUT_ACK == 0xd4); CLASSERT (IBNAL_MSG_PUT_DONE == 0xd5); CLASSERT (IBNAL_MSG_GET_REQ == 0xd6); CLASSERT (IBNAL_MSG_GET_DONE == 0xd7); /* Checks for struct kib_connparams_t */ CLASSERT ((int)sizeof(kib_connparams_t) == 12); CLASSERT ((int)offsetof(kib_connparams_t, ibcp_queue_depth) == 0); CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_queue_depth) == 4); CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_msg_size) == 4); CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_msg_size) == 4); CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_frags) == 8); CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_frags) == 4); /* Checks for struct kib_immediate_msg_t */ CLASSERT ((int)sizeof(kib_immediate_msg_t) == 72); CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_hdr) == 0); CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72); CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85); CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1); CLASSERT (IBNAL_USE_FMR == 1); /* Checks for struct kib_rdma_desc_t */ CLASSERT ((int)sizeof(kib_rdma_desc_t) == 16); CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_addr) == 0); CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_addr) == 8); CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nob) == 8); CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nob) == 4); CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 12); CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4); /* Checks for struct kib_putreq_msg_t */ CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80); CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_hdr) == 0); CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_hdr) == 72); CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_cookie) == 72); CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8); /* Checks for struct kib_putack_msg_t */ CLASSERT ((int)sizeof(kib_putack_msg_t) == 32); CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0); CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8); CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8); CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8); CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16); CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 16); /* Checks for struct kib_get_msg_t */ CLASSERT ((int)sizeof(kib_get_msg_t) == 96); CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0); CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72); CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72); CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8); CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80); CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 16); /* Checks for struct kib_completion_msg_t */ CLASSERT ((int)sizeof(kib_completion_msg_t) == 12); CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_cookie) == 0); CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_cookie) == 8); CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_status) == 8); CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4); /* Checks for struct kib_msg_t */ CLASSERT ((int)sizeof(kib_msg_t) == 152); CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4); CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_version) == 2); CLASSERT ((int)offsetof(kib_msg_t, ibm_type) == 6); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_type) == 1); CLASSERT ((int)offsetof(kib_msg_t, ibm_credits) == 7); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_credits) == 1); CLASSERT ((int)offsetof(kib_msg_t, ibm_nob) == 8); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_nob) == 4); CLASSERT ((int)offsetof(kib_msg_t, ibm_cksum) == 12); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_cksum) == 4); CLASSERT ((int)offsetof(kib_msg_t, ibm_srcnid) == 16); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcnid) == 8); CLASSERT ((int)offsetof(kib_msg_t, ibm_srcstamp) == 24); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcstamp) == 8); CLASSERT ((int)offsetof(kib_msg_t, ibm_dstnid) == 32); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dstnid) == 8); CLASSERT ((int)offsetof(kib_msg_t, ibm_dststamp) == 40); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dststamp) == 8); CLASSERT ((int)offsetof(kib_msg_t, ibm_seq) == 48); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_seq) == 8); CLASSERT ((int)offsetof(kib_msg_t, ibm_u.connparams) == 56); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.connparams) == 12); CLASSERT ((int)offsetof(kib_msg_t, ibm_u.immediate) == 56); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.immediate) == 72); CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80); CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 32); CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 96); CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12); } __u32 kibnal_cksum (void *ptr, int nob) { char *c = ptr; __u32 sum = 0; while (nob-- > 0) sum = ((sum << 1) | (sum >> 31)) + *c++; /* ensure I don't return 0 (== no checksum) */ return (sum == 0) ? 1 : sum; } void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob) { msg->ibm_type = type; msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; } void kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, lnet_nid_t dstnid, __u64 dststamp, __u64 seq) { /* CAVEAT EMPTOR! all message fields not set here should have been * initialised previously. */ msg->ibm_magic = IBNAL_MSG_MAGIC; msg->ibm_version = version; /* ibm_type */ msg->ibm_credits = credits; /* ibm_nob */ msg->ibm_cksum = 0; msg->ibm_srcnid = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid, dstnid); msg->ibm_srcstamp = kibnal_data.kib_incarnation; msg->ibm_dstnid = dstnid; msg->ibm_dststamp = dststamp; msg->ibm_seq = seq; if (*kibnal_tunables.kib_cksum) { /* NB ibm_cksum zero while computing cksum */ msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); } } int kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) { const int hdr_size = offsetof(kib_msg_t, ibm_u); __u32 msg_cksum; __u32 msg_version; int flip; int msg_nob; #if !IBNAL_USE_FMR int i; int n; #endif /* 6 bytes are enough to have received magic + version */ if (nob < 6) { CERROR("Short message: %d\n", nob); return -EPROTO; } /* Future protocol version compatibility support! * If the viblnd-specific protocol changes, or when LNET unifies * protocols over all LNDs, the initial connection will negotiate a * protocol version. If I find this, I avoid any console errors. If * my is doing connection establishment, the reject will tell the peer * which version I'm running. */ if (msg->ibm_magic == IBNAL_MSG_MAGIC) { flip = 0; } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) { flip = 1; } else { if (msg->ibm_magic == LNET_PROTO_MAGIC || msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) return -EPROTO; /* Completely out to lunch */ CERROR("Bad magic: %08x\n", msg->ibm_magic); return -EPROTO; } msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; if (expected_version == 0) { if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && msg_version != IBNAL_MSG_VERSION) return -EPROTO; } else if (msg_version != expected_version) { CERROR("Bad version: %x(%x expected)\n", msg_version, expected_version); return -EPROTO; } if (nob < hdr_size) { CERROR("Short message: %d\n", nob); return -EPROTO; } msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; if (msg_nob > nob) { CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); return -EPROTO; } /* checksum must be computed with ibm_cksum zero and BEFORE anything * gets flipped */ msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; msg->ibm_cksum = 0; if (msg_cksum != 0 && msg_cksum != kibnal_cksum(msg, msg_nob)) { CERROR("Bad checksum\n"); return -EPROTO; } msg->ibm_cksum = msg_cksum; if (flip) { /* leave magic unflipped as a clue to peer endianness */ msg->ibm_version = msg_version; CLASSERT (sizeof(msg->ibm_type) == 1); CLASSERT (sizeof(msg->ibm_credits) == 1); msg->ibm_nob = msg_nob; __swab64s(&msg->ibm_srcnid); __swab64s(&msg->ibm_srcstamp); __swab64s(&msg->ibm_dstnid); __swab64s(&msg->ibm_dststamp); __swab64s(&msg->ibm_seq); } if (msg->ibm_srcnid == LNET_NID_ANY) { CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); return -EPROTO; } switch (msg->ibm_type) { default: CERROR("Unknown message type %x\n", msg->ibm_type); return -EPROTO; case IBNAL_MSG_NOOP: break; case IBNAL_MSG_IMMEDIATE: if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) { CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob, (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])); return -EPROTO; } break; case IBNAL_MSG_PUT_REQ: if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) { CERROR("Short PUT_REQ: %d(%d)\n", msg_nob, (int)(hdr_size + sizeof(msg->ibm_u.putreq))); return -EPROTO; } break; case IBNAL_MSG_PUT_ACK: if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) { CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, (int)(hdr_size + sizeof(msg->ibm_u.putack))); return -EPROTO; } #if IBNAL_USE_FMR if (flip) { __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr); __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob); __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); } #else if (flip) { __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag); } n = msg->ibm_u.putack.ibpam_rd.rd_nfrag; if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", n, IBNAL_MAX_RDMA_FRAGS); return -EPROTO; } if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) { CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])); return -EPROTO; } if (flip) { for (i = 0; i < n; i++) { __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob); __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo); __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi); } } #endif break; case IBNAL_MSG_GET_REQ: if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) { CERROR("Short GET_REQ: %d(%d)\n", msg_nob, (int)(hdr_size + sizeof(msg->ibm_u.get))); return -EPROTO; } #if IBNAL_USE_FMR if (flip) { __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr); __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob); __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); } #else if (flip) { __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag); } n = msg->ibm_u.get.ibgm_rd.rd_nfrag; if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", n, IBNAL_MAX_RDMA_FRAGS); return -EPROTO; } if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) { CERROR("Short GET_REQ: %d(%d)\n", msg_nob, (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])); return -EPROTO; } if (flip) for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) { __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob); __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo); __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi); } #endif break; case IBNAL_MSG_PUT_NAK: case IBNAL_MSG_PUT_DONE: case IBNAL_MSG_GET_DONE: if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) { CERROR("Short RDMA completion: %d(%d)\n", msg_nob, (int)(hdr_size + sizeof(msg->ibm_u.completion))); return -EPROTO; } if (flip) __swab32s(&msg->ibm_u.completion.ibcm_status); break; case IBNAL_MSG_CONNREQ: case IBNAL_MSG_CONNACK: if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) { CERROR("Short connreq/ack: %d(%d)\n", msg_nob, (int)(hdr_size + sizeof(msg->ibm_u.connparams))); return -EPROTO; } if (flip) { __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth); __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); __swab32s(&msg->ibm_u.connparams.ibcp_max_frags); } break; } return 0; } int kibnal_start_listener (lnet_ni_t *ni) { static cm_listen_data_t info; cm_return_t cmrc; LASSERT (kibnal_data.kib_listen_handle == NULL); kibnal_data.kib_listen_handle = cm_create_cep(cm_cep_transp_rc); if (kibnal_data.kib_listen_handle == NULL) { CERROR ("Can't create listen CEP\n"); return -ENOMEM; } CDEBUG(D_NET, "Created CEP %p for listening\n", kibnal_data.kib_listen_handle); memset(&info, 0, sizeof(info)); info.listen_addr.end_pt.sid = (__u64)(*kibnal_tunables.kib_service_number); cmrc = cm_listen(kibnal_data.kib_listen_handle, &info, kibnal_listen_callback, NULL); if (cmrc == cm_stat_success) return 0; CERROR ("cm_listen error: %d\n", cmrc); cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); LASSERT (cmrc == cm_stat_success); kibnal_data.kib_listen_handle = NULL; return -EINVAL; } void kibnal_stop_listener(lnet_ni_t *ni) { cm_return_t cmrc; LASSERT (kibnal_data.kib_listen_handle != NULL); cmrc = cm_cancel(kibnal_data.kib_listen_handle); if (cmrc != cm_stat_success) CERROR ("Error %d stopping listener\n", cmrc); cfs_pause(cfs_time_seconds(1)/10); /* ensure no more callbacks */ cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); if (cmrc != vv_return_ok) CERROR ("Error %d destroying CEP\n", cmrc); kibnal_data.kib_listen_handle = NULL; } int kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid) { kib_peer_t *peer; unsigned long flags; int rc; LASSERT (nid != LNET_NID_ANY); LIBCFS_ALLOC(peer, sizeof (*peer)); if (peer == NULL) { CERROR("Cannot allocate peer\n"); return -ENOMEM; } memset(peer, 0, sizeof(*peer)); /* zero flags etc */ peer->ibp_nid = nid; atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */ INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ INIT_LIST_HEAD (&peer->ibp_conns); INIT_LIST_HEAD (&peer->ibp_tx_queue); peer->ibp_error = 0; peer->ibp_last_alive = cfs_time_current(); peer->ibp_reconnect_interval = 0; /* OK to connect at any time */ write_lock_irqsave(&kibnal_data.kib_global_lock, flags); if (atomic_read(&kibnal_data.kib_npeers) >= *kibnal_tunables.kib_concurrent_peers) { rc = -EOVERFLOW; /* !! but at least it distinguishes */ } else if (kibnal_data.kib_listen_handle == NULL) { rc = -ESHUTDOWN; /* shutdown has started */ } else { rc = 0; /* npeers only grows with the global lock held */ atomic_inc(&kibnal_data.kib_npeers); } write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); if (rc != 0) { CERROR("Can't create peer: %s\n", (rc == -ESHUTDOWN) ? "shutting down" : "too many peers"); LIBCFS_FREE(peer, sizeof(*peer)); } else { *peerp = peer; } return rc; } void kibnal_destroy_peer (kib_peer_t *peer) { LASSERT (atomic_read (&peer->ibp_refcount) == 0); LASSERT (peer->ibp_persistence == 0); LASSERT (!kibnal_peer_active(peer)); LASSERT (peer->ibp_connecting == 0); LASSERT (peer->ibp_accepting == 0); LASSERT (list_empty (&peer->ibp_conns)); LASSERT (list_empty (&peer->ibp_tx_queue)); LIBCFS_FREE (peer, sizeof (*peer)); /* NB a peer's connections keep a reference on their peer until * they are destroyed, so we can be assured that _all_ state to do * with this peer has been cleaned up when its refcount drops to * zero. */ atomic_dec(&kibnal_data.kib_npeers); } kib_peer_t * kibnal_find_peer_locked (lnet_nid_t nid) { /* the caller is responsible for accounting the additional reference * that this creates */ struct list_head *peer_list = kibnal_nid2peerlist (nid); struct list_head *tmp; kib_peer_t *peer; list_for_each (tmp, peer_list) { peer = list_entry (tmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ peer->ibp_connecting != 0 || /* creating conns */ peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); /* active conn */ if (peer->ibp_nid != nid) continue; CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", peer, libcfs_nid2str(nid), atomic_read (&peer->ibp_refcount)); return (peer); } return (NULL); } void kibnal_unlink_peer_locked (kib_peer_t *peer) { LASSERT (peer->ibp_persistence == 0); LASSERT (list_empty(&peer->ibp_conns)); LASSERT (kibnal_peer_active(peer)); list_del_init (&peer->ibp_list); /* lose peerlist's ref */ kibnal_peer_decref(peer); } int kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *persistencep) { kib_peer_t *peer; struct list_head *ptmp; int i; unsigned long flags; read_lock_irqsave(&kibnal_data.kib_global_lock, flags); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { list_for_each (ptmp, &kibnal_data.kib_peers[i]) { peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); if (index-- > 0) continue; *nidp = peer->ibp_nid; *ipp = peer->ibp_ip; *persistencep = peer->ibp_persistence; read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (0); } } read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (-ENOENT); } int kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip) { kib_peer_t *peer; kib_peer_t *peer2; unsigned long flags; int rc; CDEBUG(D_NET, "%s at %u.%u.%u.%u\n", libcfs_nid2str(nid), HIPQUAD(ip)); if (nid == LNET_NID_ANY) return (-EINVAL); rc = kibnal_create_peer(&peer, nid); if (rc != 0) return rc; write_lock_irqsave(&kibnal_data.kib_global_lock, flags); /* I'm always called with a reference on kibnal_data.kib_ni * so shutdown can't have started */ LASSERT (kibnal_data.kib_listen_handle != NULL); peer2 = kibnal_find_peer_locked (nid); if (peer2 != NULL) { kibnal_peer_decref (peer); peer = peer2; } else { /* peer table takes existing ref on peer */ list_add_tail (&peer->ibp_list, kibnal_nid2peerlist (nid)); } peer->ibp_ip = ip; peer->ibp_persistence++; write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (0); } void kibnal_del_peer_locked (kib_peer_t *peer) { struct list_head *ctmp; struct list_head *cnxt; kib_conn_t *conn; peer->ibp_persistence = 0; if (list_empty(&peer->ibp_conns)) { kibnal_unlink_peer_locked(peer); } else { list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { conn = list_entry(ctmp, kib_conn_t, ibc_list); kibnal_close_conn_locked (conn, 0); } /* NB peer is no longer persistent; closing its last conn * unlinked it. */ } /* NB peer now unlinked; might even be freed if the peer table had the * last ref on it. */ } int kibnal_del_peer (lnet_nid_t nid) { CFS_LIST_HEAD (zombies); struct list_head *ptmp; struct list_head *pnxt; kib_peer_t *peer; int lo; int hi; int i; unsigned long flags; int rc = -ENOENT; write_lock_irqsave(&kibnal_data.kib_global_lock, flags); if (nid != LNET_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; hi = kibnal_data.kib_peer_hash_size - 1; } for (i = lo; i <= hi; i++) { list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) continue; if (!list_empty(&peer->ibp_tx_queue)) { LASSERT (list_empty(&peer->ibp_conns)); list_splice_init(&peer->ibp_tx_queue, &zombies); } kibnal_del_peer_locked (peer); rc = 0; /* matched something */ } } write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); kibnal_txlist_done(&zombies, -EIO); return (rc); } kib_conn_t * kibnal_get_conn_by_idx (int index) { kib_peer_t *peer; struct list_head *ptmp; kib_conn_t *conn; struct list_head *ctmp; int i; unsigned long flags; read_lock_irqsave(&kibnal_data.kib_global_lock, flags); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { list_for_each (ptmp, &kibnal_data.kib_peers[i]) { peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence > 0 || peer->ibp_connecting != 0 || peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); list_for_each (ctmp, &peer->ibp_conns) { if (index-- > 0) continue; conn = list_entry (ctmp, kib_conn_t, ibc_list); kibnal_conn_addref(conn); read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (conn); } } } read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (NULL); } void kibnal_debug_rx (kib_rx_t *rx) { CDEBUG(D_CONSOLE, " %p nob %d msg_type %x " "cred %d seq "LPD64"\n", rx, rx->rx_nob, rx->rx_msg->ibm_type, rx->rx_msg->ibm_credits, rx->rx_msg->ibm_seq); } void kibnal_debug_tx (kib_tx_t *tx) { CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx " "cookie "LPX64" msg %s%s type %x cred %d seq "LPD64"\n", tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, tx->tx_status, tx->tx_deadline, tx->tx_cookie, tx->tx_lntmsg[0] == NULL ? "-" : "!", tx->tx_lntmsg[1] == NULL ? "-" : "!", tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits, tx->tx_msg->ibm_seq); } void kibnal_debug_conn (kib_conn_t *conn) { struct list_head *tmp; int i; spin_lock(&conn->ibc_lock); CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n", atomic_read(&conn->ibc_refcount), conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); CDEBUG(D_CONSOLE, " txseq "LPD64" rxseq "LPD64" state %d \n", conn->ibc_txseq, conn->ibc_rxseq, conn->ibc_state); CDEBUG(D_CONSOLE, " nposted %d cred %d o_cred %d r_cred %d\n", conn->ibc_nsends_posted, conn->ibc_credits, conn->ibc_outstanding_credits, conn->ibc_reserved_credits); CDEBUG(D_CONSOLE, " disc %d comms_err %d\n", conn->ibc_disconnect, conn->ibc_comms_error); CDEBUG(D_CONSOLE, " early_rxs:\n"); list_for_each(tmp, &conn->ibc_early_rxs) kibnal_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); list_for_each(tmp, &conn->ibc_tx_queue_nocred) kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); CDEBUG(D_CONSOLE, " tx_queue:\n"); list_for_each(tmp, &conn->ibc_tx_queue) kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); CDEBUG(D_CONSOLE, " active_txs:\n"); list_for_each(tmp, &conn->ibc_active_txs) kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); CDEBUG(D_CONSOLE, " rxs:\n"); for (i = 0; i < IBNAL_RX_MSGS; i++) kibnal_debug_rx(&conn->ibc_rxs[i]); spin_unlock(&conn->ibc_lock); } int kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) { static vv_qp_attr_t attr; kib_connvars_t *cv = conn->ibc_connvars; vv_return_t vvrc; /* Only called by connd => static OK */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); memset(&attr, 0, sizeof(attr)); switch (new_state) { default: LBUG(); case vv_qp_state_init: { struct vv_qp_modify_init_st *init = &attr.modify.params.init; init->p_key_indx = cv->cv_pkey_index; init->phy_port_num = cv->cv_port; init->q_key = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */ init->access_control = vv_acc_r_mem_read | vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */ attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX | VV_QP_AT_PHY_PORT_NUM | VV_QP_AT_ACCESS_CON_F; break; } case vv_qp_state_rtr: { struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr; vv_add_vec_t *av = &rtr->remote_add_vec; av->dlid = cv->cv_path.dlid; av->grh_flag = (!IBNAL_LOCAL_SUB); av->max_static_rate = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate); av->service_level = cv->cv_path.sl; av->source_path_bit = IBNAL_SOURCE_PATH_BIT; av->pmtu = cv->cv_path.mtu; av->rnr_retry_count = cv->cv_rnr_count; av->global_dest.traffic_class = cv->cv_path.traffic_class; av->global_dest.hope_limit = cv->cv_path.hop_limut; av->global_dest.flow_lable = cv->cv_path.flow_label; av->global_dest.s_gid_index = cv->cv_sgid_index; // XXX other av fields zero? rtr->destanation_qp = cv->cv_remote_qpn; rtr->receive_psn = cv->cv_rxpsn; rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD; rtr->opt_min_rnr_nak_timer = *kibnal_tunables.kib_rnr_nak_timer; // XXX sdp sets VV_QP_AT_OP_F but no actual optional options attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC | VV_QP_AT_DEST_QP | VV_QP_AT_R_PSN | VV_QP_AT_MIN_RNR_NAK_T | VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM | VV_QP_AT_OP_F; break; } case vv_qp_state_rts: { struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts; rts->send_psn = cv->cv_txpsn; rts->local_ack_timeout = *kibnal_tunables.kib_local_ack_timeout; rts->retry_num = *kibnal_tunables.kib_retry_cnt; rts->rnr_num = *kibnal_tunables.kib_rnr_cnt; rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD; attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN | VV_QP_AT_L_ACK_T | VV_QP_AT_RETRY_NUM | VV_QP_AT_RNR_NUM | VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM; break; } case vv_qp_state_error: case vv_qp_state_reset: attr.modify.vv_qp_attr_mask = 0; break; } attr.modify.qp_modify_into_state = new_state; attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE; vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL); if (vvrc != vv_return_ok) { CERROR("Can't modify qp -> %s state to %d: %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), new_state, vvrc); return -EIO; } return 0; } kib_conn_t * kibnal_create_conn (cm_cep_handle_t cep) { kib_conn_t *conn; int i; int page_offset; int ipage; vv_return_t vvrc; int rc; static vv_qp_attr_t reqattr; static vv_qp_attr_t rspattr; /* Only the connd creates conns => single threaded */ LASSERT(!in_interrupt()); LASSERT(current == kibnal_data.kib_connd); LIBCFS_ALLOC(conn, sizeof (*conn)); if (conn == NULL) { CERROR ("Can't allocate connection\n"); return (NULL); } /* zero flags, NULL pointers etc... */ memset (conn, 0, sizeof (*conn)); conn->ibc_version = IBNAL_MSG_VERSION; /* Use latest version at first */ INIT_LIST_HEAD (&conn->ibc_early_rxs); INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred); INIT_LIST_HEAD (&conn->ibc_tx_queue); INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd); INIT_LIST_HEAD (&conn->ibc_active_txs); spin_lock_init (&conn->ibc_lock); atomic_inc (&kibnal_data.kib_nconns); /* well not really, but I call destroy() on failure, which decrements */ conn->ibc_cep = cep; LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); if (conn->ibc_connvars == NULL) { CERROR("Can't allocate in-progress connection state\n"); goto failed; } memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars)); /* Random seed for QP sequence number */ get_random_bytes(&conn->ibc_connvars->cv_rxpsn, sizeof(conn->ibc_connvars->cv_rxpsn)); LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); if (conn->ibc_rxs == NULL) { CERROR("Cannot allocate RX buffers\n"); goto failed; } memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1); if (rc != 0) goto failed; for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; kib_rx_t *rx = &conn->ibc_rxs[i]; vv_mem_reg_h_t mem_h; vv_r_key_t r_key; rx->rx_conn = conn; rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, rx->rx_msg, IBNAL_MSG_SIZE, &mem_h, &rx->rx_lkey, &r_key); LASSERT (vvrc == vv_return_ok); CDEBUG(D_NET, "Rx[%d] %p->%p[%x]\n", i, rx, rx->rx_msg, rx->rx_lkey); page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; LASSERT (ipage <= IBNAL_RX_MSG_PAGES); } } memset(&reqattr, 0, sizeof(reqattr)); reqattr.create.qp_type = vv_qp_type_r_conn; reqattr.create.cq_send_h = kibnal_data.kib_cq; reqattr.create.cq_receive_h = kibnal_data.kib_cq; reqattr.create.send_max_outstand_wr = (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends); reqattr.create.receive_max_outstand_wr = IBNAL_RX_MSGS; reqattr.create.max_scatgat_per_send_wr = 1; reqattr.create.max_scatgat_per_receive_wr = 1; reqattr.create.signaling_type = vv_selectable_signaling; reqattr.create.pd_h = kibnal_data.kib_pd; reqattr.create.recv_solicited_events = vv_selectable_signaling; // vv_signal_all; vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL, &conn->ibc_qp, &rspattr); if (vvrc != vv_return_ok) { CERROR ("Failed to create queue pair: %d\n", vvrc); goto failed; } /* Mark QP created */ conn->ibc_state = IBNAL_CONN_INIT_QP; conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num; if (rspattr.create_return.receive_max_outstand_wr < IBNAL_RX_MSGS || rspattr.create_return.send_max_outstand_wr < (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends)) { CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n", IBNAL_RX_MSGS, (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends), rspattr.create_return.receive_max_outstand_wr, rspattr.create_return.send_max_outstand_wr); goto failed; } /* Mark init complete */ conn->ibc_state = IBNAL_CONN_INIT; /* 1 ref for caller */ atomic_set (&conn->ibc_refcount, 1); return (conn); failed: kibnal_destroy_conn (conn); return (NULL); } void kibnal_destroy_conn (kib_conn_t *conn) { vv_return_t vvrc; /* Only the connd does this (i.e. single threaded) */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); CDEBUG (D_NET, "connection %p\n", conn); LASSERT (atomic_read (&conn->ibc_refcount) == 0); LASSERT (list_empty(&conn->ibc_early_rxs)); LASSERT (list_empty(&conn->ibc_tx_queue)); LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); LASSERT (list_empty(&conn->ibc_active_txs)); LASSERT (conn->ibc_nsends_posted == 0); switch (conn->ibc_state) { default: /* conn must be completely disengaged from the network */ LBUG(); case IBNAL_CONN_DISCONNECTED: /* connvars should have been freed already */ LASSERT (conn->ibc_connvars == NULL); /* fall through */ case IBNAL_CONN_INIT: vvrc = cm_destroy_cep(conn->ibc_cep); LASSERT (vvrc == vv_return_ok); /* fall through */ case IBNAL_CONN_INIT_QP: kibnal_set_qp_state(conn, vv_qp_state_reset); vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp); if (vvrc != vv_return_ok) CERROR("Can't destroy QP: %d\n", vvrc); /* fall through */ case IBNAL_CONN_INIT_NOTHING: break; } if (conn->ibc_rx_pages != NULL) kibnal_free_pages(conn->ibc_rx_pages); if (conn->ibc_rxs != NULL) LIBCFS_FREE(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof(kib_rx_t)); if (conn->ibc_connvars != NULL) LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); if (conn->ibc_peer != NULL) kibnal_peer_decref(conn->ibc_peer); LIBCFS_FREE(conn, sizeof (*conn)); atomic_dec(&kibnal_data.kib_nconns); } int kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) { kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { conn = list_entry (ctmp, kib_conn_t, ibc_list); count++; kibnal_close_conn_locked (conn, why); } return (count); } int kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) { kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { conn = list_entry (ctmp, kib_conn_t, ibc_list); if (conn->ibc_incarnation == incarnation) continue; CDEBUG(D_NET, "Closing stale conn -> %s incarnation:"LPX64"("LPX64")\n", libcfs_nid2str(peer->ibp_nid), conn->ibc_incarnation, incarnation); count++; kibnal_close_conn_locked (conn, -ESTALE); } return (count); } int kibnal_close_matching_conns (lnet_nid_t nid) { kib_peer_t *peer; struct list_head *ptmp; struct list_head *pnxt; int lo; int hi; int i; unsigned long flags; int count = 0; write_lock_irqsave(&kibnal_data.kib_global_lock, flags); if (nid != LNET_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; hi = kibnal_data.kib_peer_hash_size - 1; } for (i = lo; i <= hi; i++) { list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) continue; count += kibnal_close_peer_conns_locked (peer, 0); } } write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); /* wildcards always succeed */ if (nid == LNET_NID_ANY) return (0); return (count == 0 ? -ENOENT : 0); } int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) { struct libcfs_ioctl_data *data = arg; int rc = -EINVAL; LASSERT (ni == kibnal_data.kib_ni); switch(cmd) { case IOC_LIBCFS_GET_PEER: { lnet_nid_t nid = 0; __u32 ip = 0; int share_count = 0; rc = kibnal_get_peer_info(data->ioc_count, &nid, &ip, &share_count); data->ioc_nid = nid; data->ioc_count = share_count; data->ioc_u32[0] = ip; data->ioc_u32[1] = *kibnal_tunables.kib_service_number; /* port */ break; } case IOC_LIBCFS_ADD_PEER: { rc = kibnal_add_persistent_peer (data->ioc_nid, data->ioc_u32[0]); /* IP */ break; } case IOC_LIBCFS_DEL_PEER: { rc = kibnal_del_peer (data->ioc_nid); break; } case IOC_LIBCFS_GET_CONN: { kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count); if (conn == NULL) rc = -ENOENT; else { // kibnal_debug_conn(conn); rc = 0; data->ioc_nid = conn->ibc_peer->ibp_nid; kibnal_conn_decref(conn); } break; } case IOC_LIBCFS_CLOSE_CONNECTION: { rc = kibnal_close_matching_conns (data->ioc_nid); break; } case IOC_LIBCFS_REGISTER_MYNID: { if (ni->ni_nid == data->ioc_nid) { rc = 0; } else { CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", libcfs_nid2str(data->ioc_nid), libcfs_nid2str(ni->ni_nid)); rc = -EINVAL; } break; } } return rc; } void kibnal_free_pages (kib_pages_t *p) { int npages = p->ibp_npages; int i; for (i = 0; i < npages; i++) if (p->ibp_pages[i] != NULL) __free_page(p->ibp_pages[i]); LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); } int kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) { kib_pages_t *p; int i; LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { CERROR ("Can't allocate buffer %d\n", npages); return (-ENOMEM); } memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); p->ibp_npages = npages; for (i = 0; i < npages; i++) { p->ibp_pages[i] = alloc_page (GFP_KERNEL); if (p->ibp_pages[i] == NULL) { CERROR ("Can't allocate page %d of %d\n", i, npages); kibnal_free_pages(p); return (-ENOMEM); } } *pp = p; return (0); } int kibnal_alloc_tx_descs (void) { int i; LIBCFS_ALLOC (kibnal_data.kib_tx_descs, IBNAL_TX_MSGS() * sizeof(kib_tx_t)); if (kibnal_data.kib_tx_descs == NULL) return -ENOMEM; memset(kibnal_data.kib_tx_descs, 0, IBNAL_TX_MSGS() * sizeof(kib_tx_t)); for (i = 0; i < IBNAL_TX_MSGS(); i++) { kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; #if IBNAL_USE_FMR LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV * sizeof(*tx->tx_pages)); if (tx->tx_pages == NULL) return -ENOMEM; #else LIBCFS_ALLOC(tx->tx_wrq, (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_wrq)); if (tx->tx_wrq == NULL) return -ENOMEM; LIBCFS_ALLOC(tx->tx_gl, (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_gl)); if (tx->tx_gl == NULL) return -ENOMEM; LIBCFS_ALLOC(tx->tx_rd, offsetof(kib_rdma_desc_t, rd_frags[IBNAL_MAX_RDMA_FRAGS])); if (tx->tx_rd == NULL) return -ENOMEM; #endif } return 0; } void kibnal_free_tx_descs (void) { int i; if (kibnal_data.kib_tx_descs == NULL) return; for (i = 0; i < IBNAL_TX_MSGS(); i++) { kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; #if IBNAL_USE_FMR if (tx->tx_pages != NULL) LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV * sizeof(*tx->tx_pages)); #else if (tx->tx_wrq != NULL) LIBCFS_FREE(tx->tx_wrq, (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_wrq)); if (tx->tx_gl != NULL) LIBCFS_FREE(tx->tx_gl, (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_gl)); if (tx->tx_rd != NULL) LIBCFS_FREE(tx->tx_rd, offsetof(kib_rdma_desc_t, rd_frags[IBNAL_MAX_RDMA_FRAGS])); #endif } LIBCFS_FREE(kibnal_data.kib_tx_descs, IBNAL_TX_MSGS() * sizeof(kib_tx_t)); } #if IBNAL_USE_FMR void kibnal_free_fmrs (int n) { int i; vv_return_t vvrc; kib_tx_t *tx; for (i = 0; i < n; i++) { tx = &kibnal_data.kib_tx_descs[i]; vvrc = vv_free_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle); if (vvrc != vv_return_ok) CWARN("vv_free_fmr[%d]: %d\n", i, vvrc); } } #endif int kibnal_setup_tx_descs (void) { int ipage = 0; int page_offset = 0; struct page *page; kib_tx_t *tx; vv_mem_reg_h_t mem_h; vv_r_key_t rkey; vv_return_t vvrc; int i; int rc; #if IBNAL_USE_FMR vv_fmr_t fmr_props; #endif /* pre-mapped messages are not bigger than 1 page */ CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); /* No fancy arithmetic when we do the buffer calculations */ CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES(), 0); if (rc != 0) return (rc); for (i = 0; i < IBNAL_TX_MSGS(); i++) { page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; tx = &kibnal_data.kib_tx_descs[i]; #if IBNAL_USE_FMR memset(&fmr_props, 0, sizeof(fmr_props)); fmr_props.pd_hndl = kibnal_data.kib_pd; fmr_props.acl = (vv_acc_r_mem_write | vv_acc_l_mem_write); fmr_props.max_pages = LNET_MAX_IOV; fmr_props.log2_page_sz = PAGE_SHIFT; fmr_props.max_outstanding_maps = *kibnal_tunables.kib_fmr_remaps; vvrc = vv_alloc_fmr(kibnal_data.kib_hca, &fmr_props, &tx->tx_md.md_fmrhandle); if (vvrc != vv_return_ok) { CERROR("Can't allocate fmr %d: %d\n", i, vvrc); kibnal_free_fmrs(i); kibnal_free_pages (kibnal_data.kib_tx_pages); return -ENOMEM; } tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps; tx->tx_md.md_active = 0; #endif tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, tx->tx_msg, IBNAL_MSG_SIZE, &mem_h, &tx->tx_lkey, &rkey); LASSERT (vvrc == vv_return_ok); CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx, tx->tx_msg, tx->tx_lkey); list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; LASSERT (ipage <= IBNAL_TX_MSG_PAGES()); } } return (0); } void kibnal_shutdown (lnet_ni_t *ni) { int i; vv_return_t vvrc; LASSERT (ni == kibnal_data.kib_ni); LASSERT (ni->ni_data == &kibnal_data); CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&libcfs_kmemory)); switch (kibnal_data.kib_init) { case IBNAL_INIT_ALL: /* stop accepting connections and prevent new peers */ kibnal_stop_listener(ni); /* nuke all existing peers */ kibnal_del_peer(LNET_NID_ANY); /* Wait for all peer state to clean up */ i = 2; while (atomic_read(&kibnal_data.kib_npeers) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */ "waiting for %d peers to disconnect\n", atomic_read(&kibnal_data.kib_npeers)); cfs_pause(cfs_time_seconds(1)); } /* fall through */ case IBNAL_INIT_CQ: vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq); if (vvrc != vv_return_ok) CERROR ("Destroy CQ error: %d\n", vvrc); /* fall through */ case IBNAL_INIT_TXD: kibnal_free_pages (kibnal_data.kib_tx_pages); #if IBNAL_USE_FMR kibnal_free_fmrs(IBNAL_TX_MSGS()); #endif /* fall through */ case IBNAL_INIT_PD: #if 0 /* Only deallocate a PD if we actually allocated one */ vvrc = vv_pd_deallocate(kibnal_data.kib_hca, kibnal_data.kib_pd); if (vvrc != vv_return_ok) CERROR ("Destroy PD error: %d\n", vvrc); #endif /* fall through */ case IBNAL_INIT_ASYNC: vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca, kibnal_async_callback); if (vvrc != vv_return_ok) CERROR("vv_dell_async_event_cb error: %d\n", vvrc); /* fall through */ case IBNAL_INIT_HCA: vvrc = vv_hca_close(kibnal_data.kib_hca); if (vvrc != vv_return_ok) CERROR ("Close HCA error: %d\n", vvrc); /* fall through */ case IBNAL_INIT_DATA: LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0); LASSERT (kibnal_data.kib_peers != NULL); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { LASSERT (list_empty (&kibnal_data.kib_peers[i])); } LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); LASSERT (list_empty (&kibnal_data.kib_connd_zombies)); LASSERT (list_empty (&kibnal_data.kib_connd_conns)); LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs)); LASSERT (list_empty (&kibnal_data.kib_connd_peers)); /* flag threads to terminate; wake and wait for them to die */ kibnal_data.kib_shutdown = 1; wake_up_all (&kibnal_data.kib_sched_waitq); wake_up_all (&kibnal_data.kib_connd_waitq); i = 2; while (atomic_read (&kibnal_data.kib_nthreads) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "Waiting for %d threads to terminate\n", atomic_read (&kibnal_data.kib_nthreads)); cfs_pause(cfs_time_seconds(1)); } /* fall through */ case IBNAL_INIT_NOTHING: break; } kibnal_free_tx_descs(); if (kibnal_data.kib_peers != NULL) LIBCFS_FREE (kibnal_data.kib_peers, sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", atomic_read (&libcfs_kmemory)); kibnal_data.kib_init = IBNAL_INIT_NOTHING; PORTAL_MODULE_UNUSE; } int kibnal_startup (lnet_ni_t *ni) { char scratch[32]; char ipif_name[32]; char *hca_name; __u32 ip; __u32 netmask; int up; int nob; int devno; struct timeval tv; int rc; int i; vv_request_event_record_t req_er; vv_return_t vvrc; LASSERT (ni->ni_lnd == &the_kiblnd); /* Only 1 instance supported */ if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) { CERROR ("Only 1 instance supported\n"); return -EPERM; } if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) { CERROR ("Can't set credits(%d) > ntx(%d)\n", *kibnal_tunables.kib_credits, *kibnal_tunables.kib_ntx); return -EINVAL; } ni->ni_maxtxcredits = *kibnal_tunables.kib_credits; ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits; CLASSERT (LNET_MAX_INTERFACES > 1); if (ni->ni_interfaces[0] != NULL) { /* Use the HCA specified in 'networks=' */ if (ni->ni_interfaces[1] != NULL) { CERROR("Multiple interfaces not supported\n"); return -EPERM; } /* Parse */ hca_name = ni->ni_interfaces[0]; nob = strlen(*kibnal_tunables.kib_hca_basename); if (strncmp(hca_name, *kibnal_tunables.kib_hca_basename, nob) || sscanf(hca_name + nob, "%d%n", &devno, &nob) < 1) { CERROR("Unrecognised HCA %s\n", hca_name); return -EINVAL; } } else { /* Use 0 */ devno = 0; hca_name = scratch; snprintf(hca_name, sizeof(scratch), "%s%d", *kibnal_tunables.kib_hca_basename, devno); if (strlen(hca_name) == sizeof(scratch) - 1) { CERROR("HCA name %s truncated\n", hca_name); return -EINVAL; } } /* Find IP address from */ snprintf(ipif_name, sizeof(ipif_name), "%s%d", *kibnal_tunables.kib_ipif_basename, devno); if (strlen(ipif_name) == sizeof(ipif_name) - 1) { CERROR("IPoIB interface name %s truncated\n", ipif_name); return -EINVAL; } rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask); if (rc != 0) { CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc); return -ENETDOWN; } if (!up) { CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name); return -ENETDOWN; } ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip); PORTAL_MODULE_USE; memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ kibnal_data.kib_ni = ni; ni->ni_data = &kibnal_data; do_gettimeofday(&tv); kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; rwlock_init(&kibnal_data.kib_global_lock); kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; LIBCFS_ALLOC (kibnal_data.kib_peers, sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); if (kibnal_data.kib_peers == NULL) { goto failed; } for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); spin_lock_init (&kibnal_data.kib_connd_lock); INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs); INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies); init_waitqueue_head (&kibnal_data.kib_connd_waitq); spin_lock_init (&kibnal_data.kib_sched_lock); init_waitqueue_head (&kibnal_data.kib_sched_waitq); spin_lock_init (&kibnal_data.kib_tx_lock); INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); rc = kibnal_alloc_tx_descs(); if (rc != 0) { CERROR("Can't allocate tx descs\n"); goto failed; } /* lists/ptrs/locks initialised */ kibnal_data.kib_init = IBNAL_INIT_DATA; /*****************************************************/ for (i = 0; i < IBNAL_N_SCHED; i++) { rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i)); if (rc != 0) { CERROR("Can't spawn vibnal scheduler[%d]: %d\n", i, rc); goto failed; } } rc = kibnal_thread_start (kibnal_connd, NULL); if (rc != 0) { CERROR ("Can't spawn vibnal connd: %d\n", rc); goto failed; } vvrc = vv_hca_open(hca_name, NULL, &kibnal_data.kib_hca); if (vvrc != vv_return_ok) { CERROR ("Can't open HCA %s: %d\n", hca_name, vvrc); goto failed; } /* Channel Adapter opened */ kibnal_data.kib_init = IBNAL_INIT_HCA; /* register to get HCA's asynchronous events. */ req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK; vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er, kibnal_async_callback); if (vvrc != vv_return_ok) { CERROR ("Can't set HCA %s callback: %d\n", hca_name, vvrc); goto failed; } kibnal_data.kib_init = IBNAL_INIT_ASYNC; /*****************************************************/ vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs); if (vvrc != vv_return_ok) { CERROR ("Can't size port attrs for %s: %d\n", hca_name, vvrc); goto failed; } kibnal_data.kib_port = -1; for (i = 0; iport_state) { case vv_state_linkDoun: CDEBUG(D_NET, "port[%d] Down\n", port_num); continue; case vv_state_linkInit: CDEBUG(D_NET, "port[%d] Init\n", port_num); continue; case vv_state_linkArm: CDEBUG(D_NET, "port[%d] Armed\n", port_num); continue; case vv_state_linkActive: CDEBUG(D_NET, "port[%d] Active\n", port_num); /* Found a suitable port. Get its GUID and PKEY. */ tbl_count = 1; vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_gid); if (vvrc != vv_return_ok) { CERROR("vv_get_port_gid_tbl failed " "for %s port %d: %d\n", hca_name, port_num, vvrc); continue; } tbl_count = 1; vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_pkey); if (vvrc != vv_return_ok) { CERROR("vv_get_port_partition_tbl failed " "for %s port %d: %d\n", hca_name, port_num, vvrc); continue; } kibnal_data.kib_port = port_num; break; case vv_state_linkActDefer: /* TODO: correct? */ case vv_state_linkNoChange: CERROR("Unexpected %s port[%d] state %d\n", hca_name, i, pattr->port_state); continue; } break; } if (kibnal_data.kib_port == -1) { CERROR ("Can't find an active port on %s\n", hca_name); goto failed; } CDEBUG(D_NET, "Using %s port %d - GID="LPX64":"LPX64"\n", hca_name, kibnal_data.kib_port, kibnal_data.kib_port_gid.scope.g.subnet, kibnal_data.kib_port_gid.scope.g.eui64); /*****************************************************/ #if 1 /* We use a pre-allocated PD */ vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd); #else vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd); #endif if (vvrc != vv_return_ok) { CERROR ("Can't init PD: %d\n", vvrc); goto failed; } /* flag PD initialised */ kibnal_data.kib_init = IBNAL_INIT_PD; /*****************************************************/ rc = kibnal_setup_tx_descs(); if (rc != 0) { CERROR ("Can't register tx descs: %d\n", rc); goto failed; } /* flag TX descs initialised */ kibnal_data.kib_init = IBNAL_INIT_TXD; /*****************************************************/ { __u32 nentries; vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(), kibnal_cq_callback, NULL, /* context */ &kibnal_data.kib_cq, &nentries); if (vvrc != 0) { CERROR ("Can't create RX CQ: %d\n", vvrc); goto failed; } /* flag CQ initialised */ kibnal_data.kib_init = IBNAL_INIT_CQ; if (nentries < IBNAL_CQ_ENTRIES()) { CERROR ("CQ only has %d entries, need %d\n", nentries, IBNAL_CQ_ENTRIES()); goto failed; } vvrc = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event); if (vvrc != 0) { CERROR ("Failed to re-arm completion queue: %d\n", rc); goto failed; } } rc = kibnal_start_listener(ni); if (rc != 0) { CERROR("Can't start listener: %d\n", rc); goto failed; } /* flag everything initialised */ kibnal_data.kib_init = IBNAL_INIT_ALL; /*****************************************************/ return (0); failed: CDEBUG(D_NET, "kibnal_startup failed\n"); kibnal_shutdown (ni); return (-ENETDOWN); } void __exit kibnal_module_fini (void) { lnet_unregister_lnd(&the_kiblnd); kibnal_tunables_fini(); } int __init kibnal_module_init (void) { int rc; vibnal_assert_wire_constants(); CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) <= cm_REQ_priv_data_len); CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) <= cm_REP_priv_data_len); CLASSERT (sizeof(kib_msg_t) <= IBNAL_MSG_SIZE); #if !IBNAL_USE_FMR CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) <= IBNAL_MSG_SIZE); CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) <= IBNAL_MSG_SIZE); #endif rc = kibnal_tunables_init(); if (rc != 0) return rc; lnet_register_lnd(&the_kiblnd); return 0; } MODULE_AUTHOR("Cluster File Systems, Inc. "); MODULE_DESCRIPTION("Kernel Voltaire IB LND v1.00"); MODULE_LICENSE("GPL"); module_init(kibnal_module_init); module_exit(kibnal_module_fini);