--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+nal_t kibnal_api;
+ptl_handle_ni_t kibnal_ni;
+kib_tunables_t kibnal_tunables;
+
+kib_data_t kibnal_data = {
+ .kib_service_id = IBNAL_SERVICE_NUMBER,
+};
+
+#ifdef CONFIG_SYSCTL
+#define IBNAL_SYSCTL 202
+
+#define IBNAL_SYSCTL_TIMEOUT 1
+
+static ctl_table kibnal_ctl_table[] = {
+ {IBNAL_SYSCTL_TIMEOUT, "timeout",
+ &kibnal_tunables.kib_io_timeout, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+ { 0 }
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+ {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table},
+ { 0 }
+};
+#endif
+
+#ifdef unused
+void
+print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+{
+ char name[32];
+
+ if (service == NULL)
+ {
+ CWARN("tag : %s\n"
+ "status : %d (NULL)\n", tag, rc);
+ return;
+ }
+ strncpy (name, service->ServiceName, sizeof(name)-1);
+ name[sizeof(name)-1] = 0;
+
+ CWARN("tag : %s\n"
+ "status : %d\n"
+ "service id: "LPX64"\n"
+ "name : %s\n"
+ "NID : "LPX64"\n", tag, rc,
+ service->RID.ServiceID, name,
+ *kibnal_service_nid_field(service));
+}
+#endif
+
+static void
+kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
+ FSTATUS frc, uint32 madrc)
+{
+ *(FSTATUS *)arg = frc;
+ up (&kibnal_data.kib_nid_signal);
+}
+
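+/* When IBNAL_CHECK_ADVERT is set, query the fabric's subnet administration
+ * (SD) for the service record we just advertised, to verify the advert
+ * really got registered. */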
+#if IBNAL_CHECK_ADVERT
+static void
+kibnal_service_query_done (void *arg, QUERY *qry,
+ QUERY_RESULT_VALUES *qry_result)
+{
+ FSTATUS frc = qry_result->Status;
+
+ if (frc != FSUCCESS &&
+ qry_result->ResultDataSize == 0)
+ frc = FERROR;
+
+ *(FSTATUS *)arg = frc;
+ up (&kibnal_data.kib_nid_signal);
+}
+
+static void
+kibnal_check_advert (void)
+{
+ QUERY *qry;
+ IB_SERVICE_RECORD *svc;
+ FSTATUS frc;
+ FSTATUS frc2;
+
+ PORTAL_ALLOC(qry, sizeof(*qry));
+ if (qry == NULL)
+ return;
+
+ memset (qry, 0, sizeof(*qry));
+ qry->InputType = InputTypeServiceRecord;
+ qry->OutputType = OutputTypeServiceRecord;
+ qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+ svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+ frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ qry,
+ kibnal_service_query_done,
+ NULL, &frc2);
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("Immediate error %d checking SM service\n", frc);
+ } else {
+ down (&kibnal_data.kib_nid_signal);
+ frc = frc2;
+
+ if (frc != FSUCCESS)
+ CERROR ("Error %d checking SM service\n", frc);
+ }
+
+ PORTAL_FREE(qry, sizeof(*qry));
+}
+#endif
+
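+/* Build the FABRIC_OPERATION_DATA describing our service record (service
+ * id, port GID, pkey and NID keys); used for both the set and delete
+ * fabric operations. */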
+static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
+{
+ IB_SERVICE_RECORD *svc;
+
+ memset (fod, 0, sizeof(*fod));
+ fod->Type = type;
+
+ svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+ svc->RID.ServiceID = kibnal_data.kib_service_id;
+ svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
+ svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
+ svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
+ svc->ServiceLease = 0xffffffff;
+
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+}
+
+static int
+kibnal_advertise (void)
+{
+ FABRIC_OPERATION_DATA *fod;
+ IB_SERVICE_RECORD *svc;
+ FSTATUS frc;
+ FSTATUS frc2;
+
+ LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC(fod, sizeof(*fod));
+ if (fod == NULL)
+ return (-ENOMEM);
+
+ fill_fod(fod, FabOpSetServiceRecord);
+ svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+ CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n",
+ svc->RID.ServiceID,
+ svc->ServiceName, *kibnal_service_nid_field(svc));
+
+ frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ fod, kibnal_service_setunset_done,
+ NULL, &frc2);
+
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("Immediate error %d advertising NID "LPX64"\n",
+ frc, kibnal_data.kib_nid);
+ goto out;
+ }
+
+ down (&kibnal_data.kib_nid_signal);
+
+ frc = frc2;
+ if (frc != FSUCCESS)
+ CERROR ("Error %d advertising BUD "LPX64"\n",
+ frc, kibnal_data.kib_nid);
+out:
+ PORTAL_FREE(fod, sizeof(*fod));
+ return (frc == FSUCCESS) ? 0 : -EINVAL;
+}
+
+static void
+kibnal_unadvertise (int expect_success)
+{
+ FABRIC_OPERATION_DATA *fod;
+ IB_SERVICE_RECORD *svc;
+ FSTATUS frc;
+ FSTATUS frc2;
+
+ LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC(fod, sizeof(*fod));
+ if (fod == NULL)
+ return;
+
+ fill_fod(fod, FabOpDeleteServiceRecord);
+ svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+ CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
+ svc->ServiceName, *kibnal_service_nid_field(svc));
+
+ frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ fod, kibnal_service_setunset_done,
+ NULL, &frc2);
+
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
+ frc, kibnal_data.kib_nid);
+ goto out;
+ }
+
+ down (&kibnal_data.kib_nid_signal);
+
+ if ((frc2 == FSUCCESS) == !!expect_success)
+ goto out;
+
+ if (expect_success)
+ CERROR("Error %d unadvertising NID "LPX64"\n",
+ frc2, kibnal_data.kib_nid);
+ else
+ CWARN("Removed conflicting NID "LPX64"\n",
+ kibnal_data.kib_nid);
+ out:
+ PORTAL_FREE(fod, sizeof(*fod));
+}
+
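+/* Change the NID this NAL advertises and listens on: tear down the old
+ * advert/listener, bump the incarnation, delete all existing peers, then
+ * create a new CEP, listen and re-advertise.  On any failure the NID is
+ * reset to PTL_NID_ANY. */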
+static int
+kibnal_set_mynid(ptl_nid_t nid)
+{
+ struct timeval tv;
+ lib_ni_t *ni = &kibnal_lib.libnal_ni;
+ int rc;
+ FSTATUS frc;
+
+ CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+ nid, ni->ni_pid.nid);
+
+ do_gettimeofday(&tv);
+
+ down (&kibnal_data.kib_nid_mutex);
+
+ if (nid == kibnal_data.kib_nid) {
+ /* no change of NID */
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
+ kibnal_data.kib_nid, nid);
+
+ if (kibnal_data.kib_nid != PTL_NID_ANY) {
+
+ kibnal_unadvertise (1);
+
+ frc = iibt_cm_cancel(kibnal_data.kib_cep);
+ if (frc != FSUCCESS && frc != FPENDING)
+ CERROR ("Error %d stopping listener\n", frc);
+
+ frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
+ if (frc != FSUCCESS)
+ CERROR ("Error %d destroying CEP\n", frc);
+
+ kibnal_data.kib_cep = NULL;
+ }
+
+ kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+ kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+ /* Delete all existing peers and their connections after new
+ * NID/incarnation set to ensure no old connections in our brave
+ * new world. */
+ kibnal_del_peer (PTL_NID_ANY, 0);
+
+ if (kibnal_data.kib_nid == PTL_NID_ANY) {
+ /* No new NID to install */
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ /* remove any previous advert (crashed node etc) */
+ kibnal_unadvertise(0);
+
+ kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
+ if (kibnal_data.kib_cep == NULL) {
+ CERROR ("Can't create CEP\n");
+ rc = -ENOMEM;
+ } else {
+ CM_LISTEN_INFO info;
+ memset (&info, 0, sizeof(info));
+ info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
+
+ frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
+ kibnal_listen_callback, NULL);
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("iibt_cm_listen error: %d\n", frc);
+ rc = -EINVAL;
+ } else {
+ rc = 0;
+ }
+ }
+
+ if (rc == 0) {
+ rc = kibnal_advertise();
+ if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+ kibnal_check_advert();
+#endif
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ iibt_cm_cancel (kibnal_data.kib_cep);
+ iibt_cm_destroy_cep (kibnal_data.kib_cep);
+ /* remove any peers that sprung up while I failed to
+ * advertise myself */
+ kibnal_del_peer (PTL_NID_ANY, 0);
+ }
+
+ kibnal_data.kib_nid = PTL_NID_ANY;
+ up (&kibnal_data.kib_nid_mutex);
+ return (rc);
+}
+
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
+{
+ kib_peer_t *peer;
+
+ LASSERT (nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC (peer, sizeof (*peer));
+ if (peer == NULL)
+ return (NULL);
+
+ memset(peer, 0, sizeof(*peer)); /* zero flags etc */
+
+ peer->ibp_nid = nid;
+ atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
+
+ INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
+ INIT_LIST_HEAD (&peer->ibp_conns);
+ INIT_LIST_HEAD (&peer->ibp_tx_queue);
+
+ peer->ibp_reconnect_time = jiffies;
+ peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+ atomic_inc (&kibnal_data.kib_npeers);
+ return (peer);
+}
+
+void
+kibnal_destroy_peer (kib_peer_t *peer)
+{
+
+ LASSERT (atomic_read (&peer->ibp_refcount) == 0);
+ LASSERT (peer->ibp_persistence == 0);
+ LASSERT (!kibnal_peer_active(peer));
+ LASSERT (peer->ibp_connecting == 0);
+ LASSERT (list_empty (&peer->ibp_conns));
+ LASSERT (list_empty (&peer->ibp_tx_queue));
+
+ PORTAL_FREE (peer, sizeof (*peer));
+
+ /* NB a peer's connections keep a reference on their peer until
+ * they are destroyed, so we can be assured that _all_ state to do
+ * with this peer has been cleaned up when its refcount drops to
+ * zero. */
+ atomic_dec (&kibnal_data.kib_npeers);
+}
+
+/* the caller is responsible for accounting for the additional reference
+ * that this creates */
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
+{
+ struct list_head *peer_list = kibnal_nid2peerlist (nid);
+ struct list_head *tmp;
+ kib_peer_t *peer;
+
+ list_for_each (tmp, peer_list) {
+
+ peer = list_entry (tmp, kib_peer_t, ibp_list);
+
+ LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
+ peer->ibp_connecting != 0 || /* creating conns */
+ !list_empty (&peer->ibp_conns)); /* active conn */
+
+ if (peer->ibp_nid != nid)
+ continue;
+
+ CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
+ peer, nid, atomic_read (&peer->ibp_refcount));
+ return (peer);
+ }
+ return (NULL);
+}
+
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
+{
+ kib_peer_t *peer;
+
+ read_lock (&kibnal_data.kib_global_lock);
+ peer = kibnal_find_peer_locked (nid);
+ if (peer != NULL) /* +1 ref for caller */
+ kib_peer_addref(peer);
+ read_unlock (&kibnal_data.kib_global_lock);
+
+ return (peer);
+}
+
+void
+kibnal_unlink_peer_locked (kib_peer_t *peer)
+{
+ LASSERT (peer->ibp_persistence == 0);
+ LASSERT (list_empty(&peer->ibp_conns));
+
+ LASSERT (kibnal_peer_active(peer));
+ list_del_init (&peer->ibp_list);
+ /* lose peerlist's ref */
+ kib_peer_decref(peer);
+}
+
+static int
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ int i;
+
+ read_lock (&kibnal_data.kib_global_lock);
+
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+
+ list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence != 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ if (index-- > 0)
+ continue;
+
+ *nidp = peer->ibp_nid;
+ *persistencep = peer->ibp_persistence;
+
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (0);
+ }
+ }
+
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (-ENOENT);
+}
+
+static int
+kibnal_add_persistent_peer (ptl_nid_t nid)
+{
+ unsigned long flags;
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+
+ if (nid == PTL_NID_ANY)
+ return (-EINVAL);
+
+ peer = kibnal_create_peer (nid);
+ if (peer == NULL)
+ return (-ENOMEM);
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ peer2 = kibnal_find_peer_locked (nid);
+ if (peer2 != NULL) {
+ kib_peer_decref (peer);
+ peer = peer2;
+ } else {
+ /* peer table takes existing ref on peer */
+ list_add_tail (&peer->ibp_list,
+ kibnal_nid2peerlist (nid));
+ }
+
+ peer->ibp_persistence++;
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ return (0);
+}
+
+static void
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+{
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ kib_conn_t *conn;
+
+ if (!single_share)
+ peer->ibp_persistence = 0;
+ else if (peer->ibp_persistence > 0)
+ peer->ibp_persistence--;
+
+ if (peer->ibp_persistence != 0)
+ return;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ kibnal_close_conn_locked (conn, 0);
+ }
+
+ /* NB peer unlinks itself when last conn is closed */
+}
+
+int
+kibnal_del_peer (ptl_nid_t nid, int single_share)
+{
+ unsigned long flags;
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ kib_peer_t *peer;
+ int lo;
+ int hi;
+ int i;
+ int rc = -ENOENT;
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ if (nid != PTL_NID_ANY)
+ lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+ else {
+ lo = 0;
+ hi = kibnal_data.kib_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence != 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+ continue;
+
+ kibnal_del_peer_locked (peer, single_share);
+ rc = 0; /* matched something */
+
+ if (single_share)
+ goto out;
+ }
+ }
+ out:
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ return (rc);
+}
+
+static kib_conn_t *
+kibnal_get_conn_by_idx (int index)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ int i;
+
+ read_lock (&kibnal_data.kib_global_lock);
+
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+ list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence > 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ list_for_each (ctmp, &peer->ibp_conns) {
+ if (index-- > 0)
+ continue;
+
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (conn);
+ }
+ }
+ }
+
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (NULL);
+}
+
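+/* Allocate a connection: its rx descriptors point into premapped message
+ * pages, and a reliable-connected QP (sized for IBNAL_MSG_QUEUE_SIZE
+ * sends/receives) is created and moved to the INIT state. */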
+kib_conn_t *
+kibnal_create_conn (void)
+{
+ kib_conn_t *conn;
+ int i;
+ __u64 vaddr = 0;
+ __u64 vaddr_base;
+ int page_offset;
+ int ipage;
+ int rc;
+ FSTATUS frc;
+ union {
+ IB_QP_ATTRIBUTES_CREATE qp_create;
+ IB_QP_ATTRIBUTES_MODIFY qp_attr;
+ } params;
+
+ PORTAL_ALLOC (conn, sizeof (*conn));
+ if (conn == NULL) {
+ CERROR ("Can't allocate connection\n");
+ return (NULL);
+ }
+
+ /* zero flags, NULL pointers etc... */
+ memset (conn, 0, sizeof (*conn));
+
+ INIT_LIST_HEAD (&conn->ibc_tx_queue);
+ INIT_LIST_HEAD (&conn->ibc_active_txs);
+ spin_lock_init (&conn->ibc_lock);
+
+ atomic_inc (&kibnal_data.kib_nconns);
+ /* well not really, but I call destroy() on failure, which decrements */
+
+ PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+ if (conn->ibc_rxs == NULL)
+ goto failed;
+ memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+ rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
+ if (rc != 0)
+ goto failed;
+
+ vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
+
+ for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+ struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+ kib_rx_t *rx = &conn->ibc_rxs[i];
+
+ rx->rx_conn = conn;
+ rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+ page_offset);
+
+ if (kibnal_whole_mem())
+ rx->rx_vaddr = kibnal_page2phys(page) +
+ page_offset +
+ kibnal_data.kib_md.md_addr;
+ else
+ rx->rx_vaddr = vaddr;
+
+ vaddr += IBNAL_MSG_SIZE;
+ LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
+
+ page_offset += IBNAL_MSG_SIZE;
+ LASSERT (page_offset <= PAGE_SIZE);
+
+ if (page_offset == PAGE_SIZE) {
+ page_offset = 0;
+ ipage++;
+ LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
+ }
+ }
+
+ params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
+ .Type = QPTypeReliableConnected,
+ .SendQDepth = IBNAL_TX_MAX_SG *
+ IBNAL_MSG_QUEUE_SIZE,
+ .RecvQDepth = IBNAL_MSG_QUEUE_SIZE,
+ .SendDSListDepth = 1,
+ .RecvDSListDepth = 1,
+ .SendCQHandle = kibnal_data.kib_cq,
+ .RecvCQHandle = kibnal_data.kib_cq,
+ .PDHandle = kibnal_data.kib_pd,
+ .SendSignaledCompletions = TRUE,
+ };
+ frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
+ &conn->ibc_qp, &conn->ibc_qp_attrs);
+ if (frc != FSUCCESS) {
+ CERROR ("Failed to create queue pair: %d\n", frc);
+ goto failed;
+ }
+
+ /* Mark QP created */
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
+
+ params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateInit,
+ .Attrs = (IB_QP_ATTR_PORTGUID |
+ IB_QP_ATTR_PKEYINDEX |
+ IB_QP_ATTR_ACCESSCONTROL),
+ .PortGUID = kibnal_data.kib_port_guid,
+ .PkeyIndex = 0,
+ .AccessControl = {
+ .s = {
+ .RdmaWrite = 1,
+ .RdmaRead = 1,
+ },
+ },
+ };
+ frc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Failed to modify queue pair: %d\n", frc);
+ goto failed;
+ }
+
+ /* 1 ref for caller */
+ atomic_set (&conn->ibc_refcount, 1);
+ return (conn);
+
+ failed:
+ kibnal_destroy_conn (conn);
+ return (NULL);
+}
+
+void
+kibnal_destroy_conn (kib_conn_t *conn)
+{
+ int rc;
+ FSTATUS frc;
+
+ CDEBUG (D_NET, "connection %p\n", conn);
+
+ LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+ LASSERT (list_empty(&conn->ibc_tx_queue));
+ LASSERT (list_empty(&conn->ibc_active_txs));
+ LASSERT (conn->ibc_nsends_posted == 0);
+ LASSERT (conn->ibc_connreq == NULL);
+
+ switch (conn->ibc_state) {
+ case IBNAL_CONN_DISCONNECTED:
+ /* called after connection sequence initiated */
+ /* fall through */
+
+ case IBNAL_CONN_INIT_QP:
+ /* _destroy includes an implicit Reset of the QP which
+ * discards posted work */
+ rc = iibt_qp_destroy(conn->ibc_qp);
+ if (rc != 0)
+ CERROR("Can't destroy QP: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_CONN_INIT_NOTHING:
+ break;
+
+ default:
+ LASSERT (0);
+ }
+
+ if (conn->ibc_cep != NULL) {
+ frc = iibt_cm_destroy_cep(conn->ibc_cep);
+ if (frc != 0)
+ CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep,
+ frc);
+ }
+
+ if (conn->ibc_rx_pages != NULL)
+ kibnal_free_pages(conn->ibc_rx_pages);
+
+ if (conn->ibc_rxs != NULL)
+ PORTAL_FREE(conn->ibc_rxs,
+ IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+ if (conn->ibc_peer != NULL)
+ kib_peer_decref(conn->ibc_peer);
+
+ PORTAL_FREE(conn, sizeof (*conn));
+
+ atomic_dec(&kibnal_data.kib_nconns);
+
+ if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+ kibnal_data.kib_shutdown) {
+ /* I just nuked the last connection on shutdown; wake up
+ * everyone so they can exit. */
+ wake_up_all(&kibnal_data.kib_sched_waitq);
+ wake_up_all(&kibnal_data.kib_connd_waitq);
+ }
+}
+
+void
+kibnal_put_conn (kib_conn_t *conn)
+{
+ unsigned long flags;
+
+ CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+
+ LASSERT (atomic_read (&conn->ibc_refcount) > 0);
+ if (!atomic_dec_and_test (&conn->ibc_refcount))
+ return;
+
+ /* must disconnect before dropping the final ref */
+ LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+ list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+}
+
+static int
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+ count++;
+ kibnal_close_conn_locked (conn, why);
+ }
+
+ return (count);
+}
+
+int
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
+{
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+ if (conn->ibc_incarnation == incarnation)
+ continue;
+
+ CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
+ peer->ibp_nid, conn->ibc_incarnation, incarnation);
+
+ count++;
+ kibnal_close_conn_locked (conn, -ESTALE);
+ }
+
+ return (count);
+}
+
+static int
+kibnal_close_matching_conns (ptl_nid_t nid)
+{
+ unsigned long flags;
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ int lo;
+ int hi;
+ int i;
+ int count = 0;
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ if (nid != PTL_NID_ANY)
+ lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+ else {
+ lo = 0;
+ hi = kibnal_data.kib_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence != 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+ continue;
+
+ count += kibnal_close_peer_conns_locked (peer, 0);
+ }
+ }
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ /* wildcards always succeed */
+ if (nid == PTL_NID_ANY)
+ return (0);
+
+ return (count == 0 ? -ENOENT : 0);
+}
+
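+/* NAL command dispatcher (wired up via libcfs_nal_cmd_register): peer
+ * get/add/delete, connection lookup/close and NID registration. */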
+static int
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
+{
+ int rc = -EINVAL;
+ ENTRY;
+
+ LASSERT (pcfg != NULL);
+
+ switch(pcfg->pcfg_command) {
+ case NAL_CMD_GET_PEER: {
+ ptl_nid_t nid = 0;
+ int share_count = 0;
+
+ rc = kibnal_get_peer_info(pcfg->pcfg_count,
+ &nid, &share_count);
+ pcfg->pcfg_nid = nid;
+ pcfg->pcfg_size = 0;
+ pcfg->pcfg_id = 0;
+ pcfg->pcfg_misc = 0;
+ pcfg->pcfg_count = 0;
+ pcfg->pcfg_wait = share_count;
+ break;
+ }
+ case NAL_CMD_ADD_PEER: {
+ rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
+ break;
+ }
+ case NAL_CMD_DEL_PEER: {
+ rc = kibnal_del_peer (pcfg->pcfg_nid,
+ /* flags == single_share */
+ pcfg->pcfg_flags != 0);
+ break;
+ }
+ case NAL_CMD_GET_CONN: {
+ kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+
+ if (conn == NULL)
+ rc = -ENOENT;
+ else {
+ rc = 0;
+ pcfg->pcfg_nid = conn->ibc_peer->ibp_nid;
+ pcfg->pcfg_id = 0;
+ pcfg->pcfg_misc = 0;
+ pcfg->pcfg_flags = 0;
+ kibnal_put_conn (conn);
+ }
+ break;
+ }
+ case NAL_CMD_CLOSE_CONNECTION: {
+ rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+ break;
+ }
+ case NAL_CMD_REGISTER_MYNID: {
+ if (pcfg->pcfg_nid == PTL_NID_ANY)
+ rc = -EINVAL;
+ else
+ rc = kibnal_set_mynid (pcfg->pcfg_nid);
+ break;
+ }
+ }
+
+ RETURN(rc);
+}
+
+void
+kibnal_free_pages (kib_pages_t *p)
+{
+ int npages = p->ibp_npages;
+ int rc;
+ int i;
+
+ if (p->ibp_mapped) {
+ rc = iibt_deregister_memory(p->ibp_handle);
+ if (rc != 0)
+ CERROR ("Deregister error: %d\n", rc);
+ }
+
+ for (i = 0; i < npages; i++)
+ if (p->ibp_pages[i] != NULL)
+ __free_page(p->ibp_pages[i]);
+
+ PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
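+/* Allocate npages pages and, unless whole-mem registration is in use,
+ * register them as a (discontiguous) physical memory region so callers
+ * get an IO vaddr and l/r keys for them. */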
+int
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
+{
+ kib_pages_t *p;
+ __u64 *phys_pages;
+ int i;
+ FSTATUS frc;
+ IB_ACCESS_CONTROL access;
+
+ memset(&access, 0, sizeof(access));
+ access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaRead = 1;
+ access.s.RdmaWrite = 1;
+
+ PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+ if (p == NULL) {
+ CERROR ("Can't allocate buffer %d\n", npages);
+ return (-ENOMEM);
+ }
+
+ memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+ p->ibp_npages = npages;
+
+ for (i = 0; i < npages; i++) {
+ p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+ if (p->ibp_pages[i] == NULL) {
+ CERROR ("Can't allocate page %d of %d\n", i, npages);
+ kibnal_free_pages(p);
+ return (-ENOMEM);
+ }
+ }
+
+ if (kibnal_whole_mem())
+ goto out;
+
+ PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
+ if (phys_pages == NULL) {
+ CERROR ("Can't allocate physarray for %d pages\n", npages);
+ /* XXX free ibp_pages? */
+ kibnal_free_pages(p);
+ return (-ENOMEM);
+ }
+
+ /* if we were using the _contig_ registration variant we would have
+ * an array of PhysAddr/Length pairs, but the discontiguous variant
+ * just takes the PhysAddr */
+ for (i = 0; i < npages; i++)
+ phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
+
+ frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+ 0, /* requested vaddr */
+ phys_pages, npages,
+ 0, /* offset */
+ kibnal_data.kib_pd,
+ access,
+ &p->ibp_handle, &p->ibp_vaddr,
+ &p->ibp_lkey, &p->ibp_rkey);
+
+ PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
+
+ if (frc != FSUCCESS) {
+ CERROR ("Error %d mapping %d pages\n", frc, npages);
+ kibnal_free_pages(p);
+ return (-ENOMEM);
+ }
+
+ CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
+ "lkey %x rkey %x\n", npages, p->ibp_handle,
+ p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
+
+ p->ibp_mapped = 1;
+out:
+ *pp = p;
+ return (0);
+}
+
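+/* Carve IBNAL_TX_MSGS pre-mapped message buffers out of kib_tx_pages and
+ * seed the idle tx lists (normal and reserved non-blocking descriptors). */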
+static int
+kibnal_setup_tx_descs (void)
+{
+ int ipage = 0;
+ int page_offset = 0;
+ __u64 vaddr;
+ __u64 vaddr_base;
+ struct page *page;
+ kib_tx_t *tx;
+ int i;
+ int rc;
+
+ /* pre-mapped messages are not bigger than 1 page */
+ LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+
+ /* No fancy arithmetic when we do the buffer calculations */
+ LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
+
+ rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES,
+ 0);
+ if (rc != 0)
+ return (rc);
+
+ /* ignored for the whole_mem case */
+ vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
+
+ for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+ tx = &kibnal_data.kib_tx_descs[i];
+
+ memset (tx, 0, sizeof(*tx)); /* zero flags etc */
+
+ tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+ page_offset);
+
+ if (kibnal_whole_mem())
+ tx->tx_vaddr = kibnal_page2phys(page) +
+ page_offset +
+ kibnal_data.kib_md.md_addr;
+ else
+ tx->tx_vaddr = vaddr;
+
+ tx->tx_isnblk = (i >= IBNAL_NTX);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
+
+ CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
+ i, tx, tx->tx_msg, tx->tx_vaddr);
+
+ if (tx->tx_isnblk)
+ list_add (&tx->tx_list,
+ &kibnal_data.kib_idle_nblk_txs);
+ else
+ list_add (&tx->tx_list,
+ &kibnal_data.kib_idle_txs);
+
+ vaddr += IBNAL_MSG_SIZE;
+ LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+
+ page_offset += IBNAL_MSG_SIZE;
+ LASSERT (page_offset <= PAGE_SIZE);
+
+ if (page_offset == PAGE_SIZE) {
+ page_offset = 0;
+ ipage++;
+ LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+ }
+ }
+
+ return (0);
+}
+
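+/* Tear down in the reverse of startup order: switch on kib_init and fall
+ * through each case to undo only the stages that actually completed. */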
+static void
+kibnal_api_shutdown (nal_t *nal)
+{
+ int i;
+ int rc;
+
+ if (nal->nal_refct != 0) {
+ /* This module got the first ref */
+ PORTAL_MODULE_UNUSE;
+ return;
+ }
+
+ CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+ atomic_read (&portal_kmemory));
+
+ LASSERT(nal == &kibnal_api);
+
+ switch (kibnal_data.kib_init) {
+ default:
+ CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
+ LBUG();
+
+ case IBNAL_INIT_ALL:
+ /* stop calls to nal_cmd */
+ libcfs_nal_cmd_unregister(IIBNAL);
+ /* No new peers */
+
+ /* resetting my NID to PTL_NID_ANY unadvertises me, removes my
+ * listener and nukes all current peers */
+ kibnal_set_mynid (PTL_NID_ANY);
+
+ /* Wait for all peer state to clean up (crazy) */
+ i = 2;
+ while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "waiting for %d peers to disconnect (can take a few seconds)\n",
+ atomic_read (&kibnal_data.kib_npeers));
+ set_current_state (TASK_UNINTERRUPTIBLE);
+ schedule_timeout (HZ);
+ }
+ /* fall through */
+
+ case IBNAL_INIT_CQ:
+ rc = iibt_cq_destroy(kibnal_data.kib_cq);
+ if (rc != 0)
+ CERROR ("Destroy CQ error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_TXD:
+ kibnal_free_pages (kibnal_data.kib_tx_pages);
+ /* fall through */
+
+ case IBNAL_INIT_MR:
+ if (kibnal_data.kib_md.md_handle != NULL) {
+ rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
+ if (rc != FSUCCESS)
+ CERROR ("Deregister memory: %d\n", rc);
+ }
+ /* fall through */
+
+#if IBNAL_FMR
+ case IBNAL_INIT_FMR:
+ rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
+ if (rc != 0)
+ CERROR ("Destroy FMR pool error: %d\n", rc);
+ /* fall through */
+#endif
+ case IBNAL_INIT_PD:
+ rc = iibt_pd_free(kibnal_data.kib_pd);
+ if (rc != 0)
+ CERROR ("Destroy PD error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_SD:
+ rc = iibt_sd_deregister(kibnal_data.kib_sd);
+ if (rc != 0)
+ CERROR ("Deregister SD error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_PORT:
+ /* XXX ??? */
+ /* fall through */
+
+ case IBNAL_INIT_PORTATTRS:
+ PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
+ kibnal_data.kib_hca_attrs.PortAttributesListSize);
+ /* fall through */
+
+ case IBNAL_INIT_HCA:
+ rc = iibt_close_hca(kibnal_data.kib_hca);
+ if (rc != 0)
+ CERROR ("Close HCA error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_LIB:
+ lib_fini(&kibnal_lib);
+ /* fall through */
+
+ case IBNAL_INIT_DATA:
+ /* Module refcount only gets to zero when all peers
+ * have been closed so all lists must be empty */
+ LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+ LASSERT (kibnal_data.kib_peers != NULL);
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+ LASSERT (list_empty (&kibnal_data.kib_peers[i]));
+ }
+ LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+ LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+ LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+ LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+ LASSERT (list_empty (&kibnal_data.kib_connd_peers));
+
+ /* flag threads to terminate; wake and wait for them to die */
+ kibnal_data.kib_shutdown = 1;
+ wake_up_all (&kibnal_data.kib_sched_waitq);
+ wake_up_all (&kibnal_data.kib_connd_waitq);
+
+ i = 2;
+ while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "Waiting for %d threads to terminate\n",
+ atomic_read (&kibnal_data.kib_nthreads));
+ set_current_state (TASK_INTERRUPTIBLE);
+ schedule_timeout (HZ);
+ }
+ /* fall through */
+
+ case IBNAL_INIT_NOTHING:
+ break;
+ }
+
+ if (kibnal_data.kib_tx_descs != NULL)
+ PORTAL_FREE (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS * sizeof(kib_tx_t));
+
+ if (kibnal_data.kib_peers != NULL)
+ PORTAL_FREE (kibnal_data.kib_peers,
+ sizeof (struct list_head) *
+ kibnal_data.kib_peer_hash_size);
+
+ CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+ atomic_read (&portal_kmemory));
+ printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n",
+ atomic_read(&portal_kmemory));
+
+ kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+}
+
+#define roundup_power(val, power) \
+ ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) )
+
+/* this isn't very portable or sturdy in the face of funny mem/bus configs */
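+/* Returns the length to cover with the single whole-mem registration:
+ * total RAM rounded up to 128MB, or 0 for first-generation cards (which
+ * makes whole-mem startup fail below). */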
+static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
+{
+ struct sysinfo si;
+ __u64 ret;
+
+ /* XXX we don't bother with first-gen cards */
+ if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
+ return 0ULL;
+
+ si_meminfo(&si);
+ ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
+ return roundup_power(ret, 128 * 1024 * 1024);
+}
+#undef roundup_power
+
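+/* Bring the NAL up in stages, recording progress in kib_init so that a
+ * failure at any point can "goto failed" and let kibnal_api_shutdown()
+ * unwind whatever was set up. */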
+static int
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+ ptl_ni_limits_t *requested_limits,
+ ptl_ni_limits_t *actual_limits)
+{
+ ptl_process_id_t process_id;
+ int pkmem = atomic_read(&portal_kmemory);
+ IB_PORT_ATTRIBUTES *pattr;
+ FSTATUS frc;
+ int rc;
+ int n;
+ int i;
+
+ LASSERT (nal == &kibnal_api);
+
+ if (nal->nal_refct != 0) {
+ if (actual_limits != NULL)
+ *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
+ /* This module got the first ref */
+ PORTAL_MODULE_USE;
+ return (PTL_OK);
+ }
+
+ LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+
+ frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2,
+ &kibnal_data.kib_interfaces);
+ if (frc != FSUCCESS) {
+ CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
+ frc);
+ return -ENOSYS;
+ }
+
+ init_MUTEX (&kibnal_data.kib_nid_mutex);
+ init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
+ kibnal_data.kib_nid = PTL_NID_ANY;
+
+ rwlock_init(&kibnal_data.kib_global_lock);
+
+ kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+ PORTAL_ALLOC (kibnal_data.kib_peers,
+ sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+ if (kibnal_data.kib_peers == NULL) {
+ goto failed;
+ }
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+ INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+ spin_lock_init (&kibnal_data.kib_connd_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+ init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+ spin_lock_init (&kibnal_data.kib_sched_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+ INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+ init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+ spin_lock_init (&kibnal_data.kib_tx_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+ INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+ init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+ PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ if (kibnal_data.kib_tx_descs == NULL) {
+ CERROR ("Can't allocate tx descs\n");
+ goto failed;
+ }
+
+ /* lists/ptrs/locks initialised */
+ kibnal_data.kib_init = IBNAL_INIT_DATA;
+ /*****************************************************/
+
+ process_id.pid = 0;
+ process_id.nid = kibnal_data.kib_nid;
+
+ rc = lib_init(&kibnal_lib, nal, process_id,
+ requested_limits, actual_limits);
+ if (rc != PTL_OK) {
+ CERROR("lib_init failed: error %d\n", rc);
+ goto failed;
+ }
+
+ /* lib interface initialised */
+ kibnal_data.kib_init = IBNAL_INIT_LIB;
+ /*****************************************************/
+
+ for (i = 0; i < IBNAL_N_SCHED; i++) {
+ rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
+ if (rc != 0) {
+ CERROR("Can't spawn iibnal scheduler[%d]: %d\n",
+ i, rc);
+ goto failed;
+ }
+ }
+
+ rc = kibnal_thread_start (kibnal_connd, NULL);
+ if (rc != 0) {
+ CERROR ("Can't spawn iibnal connd: %d\n", rc);
+ goto failed;
+ }
+
+ n = sizeof(kibnal_data.kib_hca_guids) /
+ sizeof(kibnal_data.kib_hca_guids[0]);
+ frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't get channel adapter guids: %d\n", frc);
+ goto failed;
+ }
+ if (n == 0) {
+ CERROR ("No channel adapters found\n");
+ goto failed;
+ }
+
+ /* Infinicon has per-HCA rather than per CQ completion handlers */
+ frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
+ kibnal_ca_callback,
+ kibnal_ca_async_callback,
+ &kibnal_data.kib_hca,
+ &kibnal_data.kib_hca);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't open CA[0]: %d\n", frc);
+ goto failed;
+ }
+
+ /* Channel Adapter opened */
+ kibnal_data.kib_init = IBNAL_INIT_HCA;
+ /*****************************************************/
+
+ kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
+ kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
+ frc = iibt_query_hca(kibnal_data.kib_hca,
+ &kibnal_data.kib_hca_attrs, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't size port attrs: %d\n", frc);
+ goto failed;
+ }
+
+ PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
+ kibnal_data.kib_hca_attrs.PortAttributesListSize);
+ if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
+ goto failed;
+
+ /* Port attrs allocated */
+ kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
+ /*****************************************************/
+
+ frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
+ NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't get port attrs for CA 0: %d\n", frc);
+ goto failed;
+ }
+
+ for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
+ pattr != NULL;
+ i++, pattr = pattr->Next) {
+ switch (pattr->PortState) {
+ default:
+ CERROR("Unexpected port[%d] state %d\n",
+ i, pattr->PortState);
+ continue;
+ case PortStateDown:
+ CDEBUG(D_NET, "port[%d] Down\n", i);
+ continue;
+ case PortStateInit:
+ CDEBUG(D_NET, "port[%d] Init\n", i);
+ continue;
+ case PortStateArmed:
+ CDEBUG(D_NET, "port[%d] Armed\n", i);
+ continue;
+
+ case PortStateActive:
+ CDEBUG(D_NET, "port[%d] Active\n", i);
+ kibnal_data.kib_port = i;
+ kibnal_data.kib_port_guid = pattr->GUID;
+ kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
+ break;
+ }
+ break;
+ }
+
+ if (pattr == NULL) {
+ CERROR ("Can't find an active port\n");
+ goto failed;
+ }
+
+ CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
+
+ /* Active port found */
+ kibnal_data.kib_init = IBNAL_INIT_PORT;
+ /*****************************************************/
+
+ frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't register with SD: %d\n", frc);
+ goto failed;
+ }
+
+ /* Registered with SD OK */
+ kibnal_data.kib_init = IBNAL_INIT_SD;
+ /*****************************************************/
+
+ frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't create PD: %d\n", rc);
+ goto failed;
+ }
+
+ /* flag PD initialised */
+ kibnal_data.kib_init = IBNAL_INIT_PD;
+ /*****************************************************/
+
+#if IBNAL_FMR
+ {
+ const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
+ struct ib_fmr_pool_param params = {
+ .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
+ .access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ),
+ .pool_size = pool_size,
+ .dirty_watermark = (pool_size * 3)/4,
+ .flush_function = NULL,
+ .flush_arg = NULL,
+ .cache = 1,
+ };
+ rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
+ &kibnal_data.kib_fmr_pool);
+ if (rc != 0) {
+ CERROR ("Can't create FMR pool size %d: %d\n",
+ pool_size, rc);
+ goto failed;
+ }
+ }
+
+ /* flag FMR pool initialised */
+ kibnal_data.kib_init = IBNAL_INIT_FMR;
+#endif
+ /*****************************************************/
+ if (IBNAL_WHOLE_MEM) {
+ IB_MR_PHYS_BUFFER phys;
+ IB_ACCESS_CONTROL access;
+ kib_md_t *md = &kibnal_data.kib_md;
+
+ memset(&access, 0, sizeof(access));
+ access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaRead = 1;
+ access.s.RdmaWrite = 1;
+
+ phys.PhysAddr = 0;
+ phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
+ if (phys.Length == 0) {
+ CERROR ("couldn't determine the end of phys mem\n");
+ goto failed;
+ }
+
+ rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
+ 0,
+ &phys, 1,
+ 0,
+ kibnal_data.kib_pd,
+ access,
+ &md->md_handle,
+ &md->md_addr,
+ &md->md_lkey,
+ &md->md_rkey);
+ if (rc != FSUCCESS) {
+ CERROR("registering physical memory failed: %d\n",
+ rc);
+ CERROR("falling back to registration per-rdma\n");
+ md->md_handle = NULL;
+ } else {
+ CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
+ phys.Length);
+ kibnal_data.kib_init = IBNAL_INIT_MR;
+ }
+ }
+
+ /*****************************************************/
+
+ rc = kibnal_setup_tx_descs();
+ if (rc != 0) {
+ CERROR ("Can't register tx descs: %d\n", rc);
+ goto failed;
+ }
+
+ /* flag TX descs initialised */
+ kibnal_data.kib_init = IBNAL_INIT_TXD;
+ /*****************************************************/
+
+ {
+ uint32 nentries;
+
+ frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
+ &kibnal_data.kib_cq, &kibnal_data.kib_cq,
+ &nentries);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't create RX CQ: %d\n", frc);
+ goto failed;
+ }
+
+ /* flag CQ initialised */
+ kibnal_data.kib_init = IBNAL_INIT_CQ;
+
+ if (nentries < IBNAL_CQ_ENTRIES) {
+ CERROR ("CQ only has %d entries, need %d\n",
+ nentries, IBNAL_CQ_ENTRIES);
+ goto failed;
+ }
+
+ rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
+ if (rc != 0) {
+ CERROR ("Failed to re-arm completion queue: %d\n", rc);
+ goto failed;
+ }
+ }
+
+ /*****************************************************/
+
+ rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL);
+ if (rc != 0) {
+ CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+ goto failed;
+ }
+
+ /* flag everything initialised */
+ kibnal_data.kib_init = IBNAL_INIT_ALL;
+ /*****************************************************/
+
+ printk(KERN_INFO "Lustre: Infinicon IB NAL loaded "
+ "(initial mem %d)\n", pkmem);
+
+ return (PTL_OK);
+
+ failed:
+ kibnal_api_shutdown (&kibnal_api);
+ return (PTL_FAIL);
+}
+
+void __exit
+kibnal_module_fini (void)
+{
+#ifdef CONFIG_SYSCTL
+ if (kibnal_tunables.kib_sysctl != NULL)
+ unregister_sysctl_table (kibnal_tunables.kib_sysctl);
+#endif
+ PtlNIFini(kibnal_ni);
+
+ ptl_unregister_nal(IIBNAL);
+}
+
+int __init
+kibnal_module_init (void)
+{
+ int rc;
+
+ if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
+ CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
+ return -EINVAL;
+ }
+
+ /* the following must be sizeof(int) for proc_dointvec() */
+ if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
+ CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
+ return -EINVAL;
+ }
+
+ kibnal_api.nal_ni_init = kibnal_api_startup;
+ kibnal_api.nal_ni_fini = kibnal_api_shutdown;
+
+ /* Initialise dynamic tunables to defaults once only */
+ kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+
+ rc = ptl_register_nal(IIBNAL, &kibnal_api);
+ if (rc != PTL_OK) {
+ CERROR("Can't register IBNAL: %d\n", rc);
+ return (-ENOMEM); /* or something... */
+ }
+
+ /* Pure gateways want the NAL started up at module load time... */
+ rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni);
+ if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+ ptl_unregister_nal(IIBNAL);
+ return (-ENODEV);
+ }
+
+#ifdef CONFIG_SYSCTL
+ /* Press on regardless even if registering sysctl doesn't work */
+ kibnal_tunables.kib_sysctl =
+ register_sysctl_table (kibnal_top_ctl_table, 0);
+#endif
+ return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
+
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#define DEBUG_SUBSYSTEM S_IBNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/nal.h>
+
+#include <linux/iba/ibt.h>
+
+#define GCC_VERSION (__GNUC__ * 10000 \
+ + __GNUC_MINOR__ * 100 \
+ + __GNUC_PATCHLEVEL__)
+
+/* Test for GCC > 3.2.2 */
+#if GCC_VERSION <= 30202
+/* GCC 3.2.2, and presumably several versions before it, will
+ * miscompile this driver. See
+ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */
+#error Invalid GCC version. Must use GCC >= 3.2.3
+#endif
+
+#define IBNAL_SERVICE_NAME "iibnal"
+#define IBNAL_SERVICE_NUMBER 0x11b9a1
+
+#if CONFIG_SMP
+# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */
+#else
+# define IBNAL_N_SCHED 1 /* # schedulers */
+#endif
+
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
+
+#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
+
+#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */
+/* an RNR retry count of 7 means retry forever; Infinicon recommended 5 */
+#define IBNAL_RETRY 5 /* # times to retry */
+#define IBNAL_RNR_RETRY 5 /* # receiver-not-ready retries */
+#define IBNAL_CM_RETRY 5 /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL 1
+#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */
+
+#define IBNAL_NTX 64 /* # tx descs */
+/* this had to be dropped down so that we only register < 255 pages per
+ * region. this will change if we register all memory. */
+#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */
+
+#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */
+
+#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */
+
+#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */
+
+/* default vals for runtime tunables */
+#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
+
+/************************/
+/* derived constants... */
+
+/* TX messages (shared by all connections) */
+#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1)
+
+/* RX messages (per connection) */
+#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+
+/* we may have up to 2 completions per transmit +
+ 1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \
+ (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
+
+#define IBNAL_RDMA_BASE 0x0eeb0000
+#define IBNAL_FMR 0
+#define IBNAL_WHOLE_MEM 1
+#define IBNAL_CKSUM 0
+//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT
+
+/* XXX I have no idea. */
+#define IBNAL_STARTING_PSN 1
+
+typedef struct
+{
+ int kib_io_timeout; /* comms timeout (seconds) */
+ struct ctl_table_header *kib_sysctl; /* sysctl interface */
+} kib_tunables_t;
+
+/* some of these have specific types in the stack that just map back
+ * to the uFOO types, like IB_{L,R}_KEY. */
+typedef struct
+{
+ int ibp_npages; /* # pages */
+ int ibp_mapped; /* mapped? */
+ __u64 ibp_vaddr; /* mapped region vaddr */
+ __u32 ibp_lkey; /* mapped region lkey */
+ __u32 ibp_rkey; /* mapped region rkey */
+ IB_HANDLE ibp_handle; /* mapped region handle */
+ struct page *ibp_pages[0];
+} kib_pages_t;
+
+typedef struct
+{
+ IB_HANDLE md_handle;
+ __u32 md_lkey;
+ __u32 md_rkey;
+ __u64 md_addr;
+} kib_md_t __attribute__((packed));
+
+typedef struct
+{
+ int kib_init; /* initialisation state */
+ __u64 kib_incarnation; /* which one am I */
+ int kib_shutdown; /* shut down? */
+ atomic_t kib_nthreads; /* # live threads */
+
+ __u64 kib_service_id; /* service number I listen on */
+ __u64 kib_port_guid; /* my GUID (lo 64 of GID)*/
+ __u16 kib_port_pkey; /* my pkey, whatever that is */
+ ptl_nid_t kib_nid; /* my NID */
+ struct semaphore kib_nid_mutex; /* serialise NID ops */
+ struct semaphore kib_nid_signal; /* signal completion */
+ IB_HANDLE kib_cep; /* connection end point */
+
+ rwlock_t kib_global_lock; /* stabilize peer/conn ops */
+
+ struct list_head *kib_peers; /* hash table of all my known peers */
+ int kib_peer_hash_size; /* size of kib_peers */
+ atomic_t kib_npeers; /* # peers extant */
+ atomic_t kib_nconns; /* # connections extant */
+
+ struct list_head kib_connd_conns; /* connections to progress */
+ struct list_head kib_connd_peers; /* peers waiting for a connection */
+ wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */
+ unsigned long kib_connd_waketime; /* when connd will wake */
+ spinlock_t kib_connd_lock; /* serialise */
+
+ wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */
+ struct list_head kib_sched_txq; /* tx requiring attention */
+ struct list_head kib_sched_rxq; /* rx requiring attention */
+ spinlock_t kib_sched_lock; /* serialise */
+
+ struct kib_tx *kib_tx_descs; /* all the tx descriptors */
+ kib_pages_t *kib_tx_pages; /* premapped tx msg pages */
+
+ struct list_head kib_idle_txs; /* idle tx descriptors */
+ struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */
+ wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */
+ __u64 kib_next_tx_cookie; /* RDMA completion cookie */
+ spinlock_t kib_tx_lock; /* serialise */
+
+ IB_HANDLE kib_hca; /* The HCA */
+ int kib_port; /* port on the device */
+ IB_HANDLE kib_pd; /* protection domain */
+ IB_HANDLE kib_sd; /* SD handle */
+ IB_HANDLE kib_cq; /* completion queue */
+ kib_md_t kib_md; /* full-mem registration */
+
+ void *kib_listen_handle; /* where I listen for connections */
+
+ IBT_INTERFACE_UNION kib_interfaces; /* The Infinicon IBT interface */
+
+ uint64 kib_hca_guids[8]; /* all the HCA guids */
+ IB_CA_ATTRIBUTES kib_hca_attrs; /* where to get HCA attrs */
+ FABRIC_OPERATION_DATA kib_fabopdata; /* (un)advertise service record */
+} kib_data_t;
+
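+/* kib_init records how far kibnal_api_startup() got; kibnal_api_shutdown()
+ * switches on it and falls through to unwind only the stages that
+ * completed. */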
+#define IBNAL_INIT_NOTHING 0
+#define IBNAL_INIT_DATA 1
+#define IBNAL_INIT_LIB 2
+#define IBNAL_INIT_HCA 3
+#define IBNAL_INIT_PORTATTRS 4
+#define IBNAL_INIT_PORT 5
+#define IBNAL_INIT_SD 6
+#define IBNAL_INIT_PD 7
+#define IBNAL_INIT_FMR 8
+#define IBNAL_INIT_MR 9
+#define IBNAL_INIT_TXD 10
+#define IBNAL_INIT_CQ 11
+#define IBNAL_INIT_ALL 12
+
+/************************************************************************
+ * Wire message structs.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD
+ * private data and SM service info) are LE on the wire.
+ */
+
+/* also kib_md_t above */
+
+typedef struct
+{
+ __u32 rd_key; /* remote key */
+ __u32 rd_nob; /* # of bytes */
+ __u64 rd_addr; /* remote io vaddr */
+} kib_rdma_desc_t __attribute__((packed));
+
+typedef struct
+{
+ ptl_hdr_t ibim_hdr; /* portals header */
+ char ibim_payload[0]; /* piggy-backed payload */
+} kib_immediate_msg_t __attribute__((packed));
+
+/* these arrays serve two purposes during rdma. they are built on the passive
+ * side and sent to the active side as remote arguments. On the active side
+ * the descs are used as a data structure on the way to local gather items.
+ * the different roles result in split local/remote meaning of desc->rd_key */
+typedef struct
+{
+ ptl_hdr_t ibrm_hdr; /* portals header */
+ __u64 ibrm_cookie; /* opaque completion cookie */
+ __u32 ibrm_num_descs; /* how many descs */
+ kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */
+} kib_rdma_msg_t __attribute__((packed));
+
+#define kib_rdma_msg_len(num_descs) \
+ offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs])
+
+typedef struct
+{
+ __u64 ibcm_cookie; /* opaque completion cookie */
+ __u32 ibcm_status; /* completion status */
+} kib_completion_msg_t __attribute__((packed));
+
+typedef struct
+{
+ __u32 ibm_magic; /* I'm an openibnal message */
+ __u16 ibm_version; /* this is my version number */
+ __u8 ibm_type; /* msg type */
+ __u8 ibm_credits; /* returned credits */
+#if IBNAL_CKSUM
+ __u32 ibm_nob;
+ __u32 ibm_cksum;
+#endif
+ union {
+ kib_immediate_msg_t immediate;
+ kib_rdma_msg_t rdma;
+ kib_completion_msg_t completion;
+ } ibm_u __attribute__((packed));
+} kib_msg_t __attribute__((packed));
+
+#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */
+#define IBNAL_MSG_VERSION 1 /* current protocol version */
+
+#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */
+
+/***********************************************************************/
+
+typedef struct kib_rx /* receive message */
+{
+ struct list_head rx_list; /* queue for attention */
+ struct kib_conn *rx_conn; /* owning conn */
+ int rx_rdma; /* RDMA completion posted? */
+ int rx_posted; /* posted? */
+ __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */
+ kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */
+ IB_WORK_REQ rx_wrq;
+ IB_LOCAL_DATASEGMENT rx_gl; /* and its memory */
+} kib_rx_t;
+
+typedef struct kib_tx /* transmit message */
+{
+ struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */
+ int tx_isnblk; /* I'm reserved for non-blocking sends */
+ struct kib_conn *tx_conn; /* owning conn */
+ int tx_mapped; /* mapped for RDMA? */
+ int tx_sending; /* # tx callbacks outstanding */
+ int tx_status; /* completion status */
+ unsigned long tx_deadline; /* completion deadline */
+ int tx_passive_rdma; /* peer sucks/blows */
+ int tx_passive_rdma_wait; /* waiting for peer to complete */
+ __u64 tx_passive_rdma_cookie; /* completion cookie */
+ lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */
+ kib_md_t tx_md; /* RDMA mapping (active/passive) */
+ __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */
+ kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */
+ int tx_nsp; /* # send work items */
+ IB_WORK_REQ tx_wrq[IBNAL_TX_MAX_SG]; /* send work items... */
+ IB_LOCAL_DATASEGMENT tx_gl[IBNAL_TX_MAX_SG]; /* ...and their memory */
+} kib_tx_t;
+
+#define KIB_TX_UNMAPPED 0
+#define KIB_TX_MAPPED 1
+#define KIB_TX_MAPPED_FMR 2
+
+typedef struct kib_wire_connreq
+{
+ __u32 wcr_magic; /* I'm an openibnal connreq */
+ __u16 wcr_version; /* this is my version number */
+ __u16 wcr_queue_depth; /* this is my receive queue size */
+ __u64 wcr_nid; /* peer's NID */
+ __u64 wcr_incarnation; /* peer's incarnation */
+} kib_wire_connreq_t;
+
+typedef struct kib_gid
+{
+ __u64 hi, lo;
+} kib_gid_t;
+
+typedef struct kib_connreq
+{
+ /* connection-in-progress */
+ struct kib_conn *cr_conn;
+ kib_wire_connreq_t cr_wcr;
+ __u64 cr_tid;
+ IB_SERVICE_RECORD cr_service;
+ kib_gid_t cr_gid;
+ IB_PATH_RECORD cr_path;
+ CM_REQUEST_INFO cr_cmreq;
+ CM_CONN_INFO cr_discarded;
+ CM_REJECT_INFO cr_rej_info;
+} kib_connreq_t;
+
+typedef struct kib_conn
+{
+ struct kib_peer *ibc_peer; /* owning peer */
+ struct list_head ibc_list; /* stash on peer's conn list */
+ __u64 ibc_incarnation; /* which instance of the peer */
+ atomic_t ibc_refcount; /* # users */
+ int ibc_state; /* what's happening */
+ atomic_t ibc_nob; /* # bytes buffered */
+ int ibc_nsends_posted; /* # uncompleted sends */
+ int ibc_credits; /* # credits I have */
+ int ibc_outstanding_credits; /* # credits to return */
+ int ibc_rcvd_disconnect;/* received discon request */
+ int ibc_sent_disconnect;/* sent discon request */
+ struct list_head ibc_tx_queue; /* send queue */
+ struct list_head ibc_active_txs; /* active tx awaiting completion */
+ spinlock_t ibc_lock; /* serialise */
+ kib_rx_t *ibc_rxs; /* the rx descs */
+ kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */
+ IB_HANDLE ibc_qp; /* queue pair */
+ IB_HANDLE ibc_cep; /* connection ID? */
+ IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs; /* QP attrs */
+ kib_connreq_t *ibc_connreq; /* connection request state */
+} kib_conn_t;
+
+#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */
+#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING 2 /* started to connect */
+#define IBNAL_CONN_ESTABLISHED 3 /* connection established */
+#define IBNAL_CONN_SEND_DREQ 4 /* to send disconnect req */
+#define IBNAL_CONN_DREQ 5 /* sent disconnect req */
+#define IBNAL_CONN_DREP 6 /* sent disconnect rep */
+#define IBNAL_CONN_DISCONNECTED 7 /* no more QP or CM traffic */
+
+#define KIB_ASSERT_CONN_STATE(conn, state) do { \
+ LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state); \
+} while (0)
+
+#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do { \
+ LASSERTF(low <= high, "%d %d\n", low, high); \
+ LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \
+ "%d\n", conn->ibc_state); \
+} while (0)
+
+typedef struct kib_peer
+{
+ struct list_head ibp_list; /* stash on global peer list */
+ struct list_head ibp_connd_list; /* schedule on kib_connd_peers */
+ ptl_nid_t ibp_nid; /* who's on the other end(s) */
+ atomic_t ibp_refcount; /* # users */
+ int ibp_persistence; /* "known" peer refs */
+ struct list_head ibp_conns; /* all active connections */
+ struct list_head ibp_tx_queue; /* msgs waiting for a conn */
+ int ibp_connecting; /* connecting+accepting */
+ unsigned long ibp_reconnect_time; /* when reconnect may be attempted */
+ unsigned long ibp_reconnect_interval; /* exponential backoff */
+} kib_peer_t;
+
+
+extern lib_nal_t kibnal_lib;
+extern kib_data_t kibnal_data;
+extern kib_tunables_t kibnal_tunables;
+
+/******************************************************************************/
+/* Infinicon IBT interface wrappers */
+#define IIBT_IF (kibnal_data.kib_interfaces.ver2)
+
+static inline FSTATUS
+iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list)
+{
+ return IIBT_IF.GetCaGuids(hca_count, hca_guid_list);
+}
+
+static inline FSTATUS
+iibt_open_hca(EUI64 hca_guid,
+ IB_COMPLETION_CALLBACK completion_callback,
+ IB_ASYNC_EVENT_CALLBACK async_event_callback,
+ void *arg,
+ IB_HANDLE *handle)
+{
+ return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback,
+ async_event_callback, arg, handle);
+}
+
+static inline FSTATUS
+iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp)
+{
+ return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp);
+}
+
+static inline FSTATUS
+iibt_close_hca(IB_HANDLE hca_handle)
+{
+ return IIBT_IF.Vpi.CloseCA(hca_handle);
+}
+
+static inline FSTATUS
+iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle)
+{
+ return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle);
+}
+
+static inline FSTATUS
+iibt_pd_free(IB_HANDLE pd_handle)
+{
+ return IIBT_IF.Vpi.FreePD(pd_handle);
+}
+
+static inline FSTATUS
+iibt_register_physical_memory(IB_HANDLE hca_handle,
+ IB_VIRT_ADDR requested_io_va,
+ void *phys_buffers, uint64 nphys_buffers,
+ uint32 io_va_offset, IB_HANDLE pd_handle,
+ IB_ACCESS_CONTROL access,
+ IB_HANDLE *mem_handle,
+ IB_VIRT_ADDR *actual_io_va,
+ IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+ return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va,
+ phys_buffers, nphys_buffers,
+ io_va_offset, pd_handle,
+ access,
+ mem_handle, actual_io_va,
+ lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_register_contig_physical_memory(IB_HANDLE hca_handle,
+ IB_VIRT_ADDR requested_io_va,
+ IB_MR_PHYS_BUFFER *phys_buffers,
+ uint64 nphys_buffers,
+ uint32 io_va_offset, IB_HANDLE pd_handle,
+ IB_ACCESS_CONTROL access,
+ IB_HANDLE *mem_handle,
+ IB_VIRT_ADDR *actual_io_va,
+ IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+ return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle,
+ requested_io_va,
+ phys_buffers,
+ nphys_buffers,
+ io_va_offset, pd_handle,
+ access,
+ mem_handle, actual_io_va,
+ lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_register_memory(IB_HANDLE hca_handle,
+ void *virt_addr, unsigned int length,
+ IB_HANDLE pd_handle,
+ IB_ACCESS_CONTROL access,
+ IB_HANDLE *mem_handle,
+ IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+ return IIBT_IF.Vpi.RegisterMemRegion(hca_handle,
+ virt_addr, length,
+ pd_handle,
+ access,
+ mem_handle,
+ lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_deregister_memory(IB_HANDLE mem_handle)
+{
+ return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle);
+}
+
+static inline FSTATUS
+iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size,
+ void *arg, IB_HANDLE *cq_handle, uint32 *actual_size)
+{
+ return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size,
+ arg, cq_handle, actual_size);
+}
+
+static inline FSTATUS
+iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc)
+{
+ return IIBT_IF.Vpi.PollCQ(cq_handle, wc);
+}
+
+static inline FSTATUS
+iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select)
+{
+ return IIBT_IF.Vpi.RearmCQ(cq_handle, select);
+}
+
+static inline FSTATUS
+iibt_cq_destroy(IB_HANDLE cq_handle)
+{
+ return IIBT_IF.Vpi.DestroyCQ(cq_handle);
+}
+
+static inline FSTATUS
+iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr,
+ void *arg, IB_HANDLE *cq_handle,
+ IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+ return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, cq_handle,
+ query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_query(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_QUERY *query_attr,
+ void **arg_ptr)
+{
+ return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr);
+}
+
+static inline FSTATUS
+iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr,
+ IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+ return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_destroy(IB_HANDLE qp_handle)
+{
+ return IIBT_IF.Vpi.DestroyQP(qp_handle);
+}
+
+static inline FSTATUS
+iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+ return IIBT_IF.Vpi.PostRecv(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+ return IIBT_IF.Vpi.PostSend(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p)
+{
+ return IIBT_IF.Sdi.Register(sd_handle, p);
+}
+
+static inline FSTATUS
+iibt_sd_deregister(IB_HANDLE sd_handle)
+{
+ return IIBT_IF.Sdi.Deregister(sd_handle);
+}
+
+static inline FSTATUS
+iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid,
+ FABRIC_OPERATION_DATA *fod,
+ PFABRIC_OPERATION_CALLBACK callback,
+ COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+ return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid,
+ fod, callback, p, arg);
+}
+
+static inline FSTATUS
+iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid,
+ QUERY *qry,
+ PQUERY_CALLBACK callback,
+ COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+ return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid,
+ qry, callback, p, arg);
+}
+
+static inline IB_HANDLE
+iibt_cm_create_cep(CM_CEP_TYPE type)
+{
+ return IIBT_IF.Cmi.CmCreateCEP(type);
+}
+
+static inline FSTATUS
+iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len,
+ uint32 offset)
+{
+ return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset);
+}
+
+static inline FSTATUS
+iibt_cm_destroy_cep(IB_HANDLE cep_handle)
+{
+ return IIBT_IF.Cmi.CmDestroyCEP(cep_handle);
+}
+
+static inline FSTATUS
+iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info,
+ PFN_CM_CALLBACK callback, void *arg)
+{
+ return IIBT_IF.Cmi.CmListen(cep, info, callback, arg);
+}
+
+static inline FSTATUS
+iibt_cm_cancel(IB_HANDLE cep)
+{
+ return IIBT_IF.Cmi.CmCancel(cep);
+}
+
+static inline FSTATUS
+iibt_cm_accept(IB_HANDLE cep,
+ CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info,
+ PFN_CM_CALLBACK callback, void *arg,
+ IB_HANDLE *new_cep)
+{
+ return IIBT_IF.Cmi.CmAccept(cep,
+ send_info, recv_info,
+ callback, arg, new_cep);
+}
+
+static inline FSTATUS
+iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej)
+{
+ return IIBT_IF.Cmi.CmReject(cep, rej);
+}
+
+static inline FSTATUS
+iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req,
+ CM_DREPLY_INFO *reply)
+{
+ return IIBT_IF.Cmi.CmDisconnect(cep, req, reply);
+}
+
+static inline FSTATUS
+iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req,
+ PFN_CM_CALLBACK callback, void *arg)
+{
+ return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg);
+}
+
+static inline int wrq_signals_completion(IB_WORK_REQ *wrq)
+{
+ return wrq->Req.SendRC.Options.s.SignaledCompletion == 1;
+}
+
+
+/******************************************************************************/
+
+/* these macros purposely avoid local variables so they don't increase
+ * stack consumption. */
+
+#define kib_peer_addref(peer) do { \
+ LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \
+ atomic_read(&peer->ibp_refcount)); \
+ CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", \
+ peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+ atomic_inc(&peer->ibp_refcount); \
+} while (0)
+
+#define kib_peer_decref(peer) do { \
+ LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \
+ atomic_read(&peer->ibp_refcount)); \
+ CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", \
+ peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+ if (atomic_dec_and_test (&peer->ibp_refcount)) { \
+ CDEBUG (D_NET, "destroying peer "LPX64" %p\n", \
+ peer->ibp_nid, peer); \
+ kibnal_destroy_peer (peer); \
+ } \
+} while (0)
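+
+/* Illustrative usage sketch only (compiled out; the function name below is
+ * hypothetical): look a peer up under the global lock, pin it with
+ * kib_peer_addref() before dropping the lock, and balance that with
+ * kib_peer_decref() when done. */
+#if 0
+static void
+kibnal_peer_ref_example (ptl_nid_t nid)
+{
+        kib_peer_t *peer;
+
+        read_lock (&kibnal_data.kib_global_lock);
+        peer = kibnal_find_peer_locked (nid);
+        if (peer != NULL)
+                kib_peer_addref(peer);          /* pin peer across unlock */
+        read_unlock (&kibnal_data.kib_global_lock);
+
+        if (peer == NULL)
+                return;
+
+        /* ... use peer ... */
+
+        kib_peer_decref(peer);                  /* may destroy the peer */
+}
+#endif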
+
+/******************************************************************************/
+
+static inline struct list_head *
+kibnal_nid2peerlist (ptl_nid_t nid)
+{
+ unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
+
+ return (&kibnal_data.kib_peers [hash]);
+}
+
+static inline int
+kibnal_peer_active(kib_peer_t *peer)
+{
+ /* Am I in the peer hash table? */
+ return (!list_empty(&peer->ibp_list));
+}
+
+static inline void
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+ /* CAVEAT EMPTOR: tx takes caller's ref on conn */
+
+ LASSERT (tx->tx_nsp > 0); /* work items set up */
+ LASSERT (tx->tx_conn == NULL); /* only set here */
+
+ tx->tx_conn = conn;
+ tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+ list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+}
+
+#define KIBNAL_SERVICE_KEY_MASK (IB_SERVICE_RECORD_COMP_SERVICENAME | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_8)
+
+static inline __u64*
+kibnal_service_nid_field(IB_SERVICE_RECORD *srv)
+{
+ /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
+ return (__u64 *)srv->ServiceData8;
+}
+
+
+static inline void
+kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid)
+{
+ LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName));
+ memset (srv->ServiceName, 0, sizeof(srv->ServiceName));
+ strcpy (srv->ServiceName, IBNAL_SERVICE_NAME);
+
+ *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
+}
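+
+/* Illustrative read-back sketch (compiled out; hypothetical helper): the NID
+ * is stored little-endian in the service data above, so it has to come back
+ * out through le64_to_cpu(). */
+#if 0
+static inline ptl_nid_t
+kibnal_service_nid (IB_SERVICE_RECORD *srv)
+{
+        return le64_to_cpu(*kibnal_service_nid_field(srv));
+}
+#endif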
+
+#if 0
+static inline void
+kibnal_show_rdma_attr (kib_conn_t *conn)
+{
+ struct ib_qp_attribute qp_attr;
+ int rc;
+
+ memset (&qp_attr, 0, sizeof(qp_attr));
+ rc = ib_qp_query(conn->ibc_qp, &qp_attr);
+ if (rc != 0) {
+ CERROR ("Can't get qp attrs: %d\n", rc);
+ return;
+ }
+
+ CWARN ("RDMA CAPABILITY: write %s read %s\n",
+ (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+ (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid",
+ (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+ (qp_attr.enable_rdma_read ? "enabled" : "disabled") : "invalid");
+}
+#endif
+
+#if CONFIG_X86
+static inline __u64
+kibnal_page2phys (struct page *p)
+{
+ __u64 page_number = p - mem_map;
+
+ return (page_number << PAGE_SHIFT);
+}
+#else
+# error "no page->phys"
+#endif
+
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive. It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+ unsigned long lptr = (unsigned long)ptr;
+
+ LASSERT ((lptr & 1) == 0);
+ return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+ return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+ return (wreqid & 1) != 0;
+}
+
+static inline int
+kibnal_whole_mem(void)
+{
+ return kibnal_data.kib_md.md_handle != NULL;
+}
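+
+/* When all of memory has been pre-registered ("whole mem" mode, i.e.
+ * kib_md.md_handle != NULL), per-transfer registration is skipped: RDMA
+ * descriptors are filled straight from kib_md's lkey/rkey plus the page
+ * physical addresses (see kibnal_fill_ibrm() and the kibnal_map_*()
+ * helpers). */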
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_destroy_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int kibnal_close_stale_conns_locked (kib_peer_t *peer,
+ __u64 incarnation);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg);
+
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
+
+extern void kibnal_check_sends (kib_conn_t *conn);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern int kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int kibnal_scheduler(void *arg);
+extern int kibnal_connd (void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern void kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status,
+ kib_rx_t *rx, lib_msg_t *libmsg,
+ unsigned int niov,
+ struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t nob);
+
+void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev);
+void kibnal_ca_callback (void *ca_arg, void *cq_arg);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+/*
+ * LIB functions follow
+ *
+ */
+static void
+kibnal_schedule_tx_done (kib_tx_t *tx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
+
+ list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+ wake_up (&kibnal_data.kib_sched_waitq);
+
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
+
+static void
+kibnal_tx_done (kib_tx_t *tx)
+{
+ ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
+ unsigned long flags;
+ int i;
+ FSTATUS frc;
+
+ LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */
+ LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */
+
+ switch (tx->tx_mapped) {
+ default:
+ LBUG();
+
+ case KIB_TX_UNMAPPED:
+ break;
+
+ case KIB_TX_MAPPED:
+ if (in_interrupt()) {
+ /* can't deregister memory in IRQ context... */
+ kibnal_schedule_tx_done(tx);
+ return;
+ }
+ frc = iibt_deregister_memory(tx->tx_md.md_handle);
+ LASSERT (frc == FSUCCESS);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
+ break;
+
+#if IBNAL_FMR
+ case KIB_TX_MAPPED_FMR:
+ if (in_interrupt() && tx->tx_status != 0) {
+ /* can't flush FMRs in IRQ context... */
+ kibnal_schedule_tx_done(tx);
+ return;
+ }
+
+ rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
+ LASSERT (rc == 0);
+
+ if (tx->tx_status != 0)
+ ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
+ break;
+#endif
+ }
+
+ for (i = 0; i < 2; i++) {
+ /* tx may have up to 2 libmsgs to finalise */
+ if (tx->tx_libmsg[i] == NULL)
+ continue;
+
+ lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+ tx->tx_libmsg[i] = NULL;
+ }
+
+ if (tx->tx_conn != NULL) {
+ kibnal_put_conn (tx->tx_conn);
+ tx->tx_conn = NULL;
+ }
+
+ tx->tx_nsp = 0;
+ tx->tx_passive_rdma = 0;
+ tx->tx_status = 0;
+
+ spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+ if (tx->tx_isnblk) {
+ list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
+ } else {
+ list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+ wake_up (&kibnal_data.kib_idle_tx_waitq);
+ }
+
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+}
+
+static kib_tx_t *
+kibnal_get_idle_tx (int may_block)
+{
+ unsigned long flags;
+ kib_tx_t *tx = NULL;
+ ENTRY;
+
+ for (;;) {
+ spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+ /* "normal" descriptor is free */
+ if (!list_empty (&kibnal_data.kib_idle_txs)) {
+ tx = list_entry (kibnal_data.kib_idle_txs.next,
+ kib_tx_t, tx_list);
+ break;
+ }
+
+ if (!may_block) {
+ /* may dip into reserve pool */
+ if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
+ CERROR ("reserved tx desc pool exhausted\n");
+ break;
+ }
+
+ tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+ kib_tx_t, tx_list);
+ break;
+ }
+
+ /* block for idle tx */
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+ wait_event (kibnal_data.kib_idle_tx_waitq,
+ !list_empty (&kibnal_data.kib_idle_txs) ||
+ kibnal_data.kib_shutdown);
+ }
+
+ if (tx != NULL) {
+ list_del (&tx->tx_list);
+
+ /* Allocate a new passive RDMA completion cookie. It might
+ * not be needed, but we've got a lock right now and we're
+ * unlikely to wrap... */
+ tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
+
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+ LASSERT (tx->tx_nsp == 0);
+ LASSERT (tx->tx_sending == 0);
+ LASSERT (tx->tx_status == 0);
+ LASSERT (tx->tx_conn == NULL);
+ LASSERT (!tx->tx_passive_rdma);
+ LASSERT (!tx->tx_passive_rdma_wait);
+ LASSERT (tx->tx_libmsg[0] == NULL);
+ LASSERT (tx->tx_libmsg[1] == NULL);
+ }
+
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+ RETURN(tx);
+}
+
+static int
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+ /* I would guess that if kibnal_get_peer (nid) == NULL,
+ and we're not routing, then 'nid' is very distant :) */
+ if ( nal->libnal_ni.ni_pid.nid == nid ) {
+ *dist = 0;
+ } else {
+ *dist = 1;
+ }
+
+ return 0;
+}
+
+static void
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+{
+ struct list_head *ttmp;
+ unsigned long flags;
+ int idle;
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ list_for_each (ttmp, &conn->ibc_active_txs) {
+ kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ if (!tx->tx_passive_rdma_wait ||
+ tx->tx_passive_rdma_cookie != cookie)
+ continue;
+
+ CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+
+ tx->tx_status = status;
+ tx->tx_passive_rdma_wait = 0;
+ idle = (tx->tx_sending == 0);
+
+ if (idle)
+ list_del (&tx->tx_list);
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ /* I could be racing with tx callbacks. It's whoever
+ * _makes_ tx idle that frees it */
+ if (idle)
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
+ cookie, conn->ibc_peer->ibp_nid);
+}
+
+static __u32
+kibnal_lkey(kib_pages_t *ibp)
+{
+ if (kibnal_whole_mem())
+ return kibnal_data.kib_md.md_lkey;
+
+ return ibp->ibp_lkey;
+}
+
+static void
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
+{
+ kib_conn_t *conn = rx->rx_conn;
+ int rc = 0;
+ unsigned long flags;
+ FSTATUS frc;
+ ENTRY;
+
+ rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
+ .Address = rx->rx_vaddr,
+ .Length = IBNAL_MSG_SIZE,
+ .Lkey = kibnal_lkey(conn->ibc_rx_pages),
+ };
+
+ rx->rx_wrq = (IB_WORK_REQ) {
+ .Operation = WROpRecv,
+ .DSListDepth = 1,
+ .MessageLen = IBNAL_MSG_SIZE,
+ .WorkReqId = kibnal_ptr2wreqid(rx, 1),
+ .DSList = &rx->rx_gl,
+ };
+
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
+ IBNAL_CONN_DREP);
+ LASSERT (!rx->rx_posted);
+ rx->rx_posted = 1;
+ mb();
+
+ if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+ rc = -ECONNABORTED;
+ else {
+ frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
+ if (frc != FSUCCESS) {
+ CDEBUG(D_NET, "post failed %d\n", frc);
+ rc = -EINVAL;
+ }
+ CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+ }
+
+ if (rc == 0) {
+ if (do_credits) {
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+ conn->ibc_outstanding_credits++;
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ kibnal_check_sends(conn);
+ }
+ EXIT;
+ return;
+ }
+
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ CERROR ("Error posting receive -> "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, rc);
+ kibnal_close_conn (rx->rx_conn, rc);
+ } else {
+ CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, rc);
+ }
+
+ /* Drop rx's ref */
+ kibnal_put_conn (conn);
+ EXIT;
+}
+
+#if IBNAL_CKSUM
+static inline __u32 kibnal_cksum (void *ptr, int nob)
+{
+ char *c = ptr;
+ __u32 sum = 0;
+
+ while (nob-- > 0)
+ sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+ return (sum);
+}
+#endif
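+
+/* The checksum above is a simple rotate-left-and-add over the whole message:
+ * when IBNAL_CKSUM is set, the sender fills in ibm_cksum in
+ * kibnal_check_sends() and the receiver recomputes and compares it in
+ * kibnal_rx_callback(). */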
+
+static void hexdump(char *string, void *ptr, int len)
+{
+ unsigned char *c = ptr;
+ int i;
+
+ return; /* hex dump disabled; remove this return to enable it */
+
+ if (len < 0 || len > 2048) {
+ printk("XXX what the hell? %d\n",len);
+ return;
+ }
+
+ printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+
+ for (i = 0; i < len;) {
+ printk("%02x",*(c++));
+ i++;
+ if (!(i & 15)) {
+ printk("\n");
+ } else if (!(i&1)) {
+ printk(" ");
+ }
+ }
+
+ if(len & 15) {
+ printk("\n");
+ }
+}
+
+static void
+kibnal_rx_callback (IB_WORK_COMPLETION *wc)
+{
+ kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ int nob = wc->Length;
+ const int base_nob = offsetof(kib_msg_t, ibm_u);
+ int credits;
+ int flipped;
+ unsigned long flags;
+ __u32 i;
+#if IBNAL_CKSUM
+ __u32 msg_cksum;
+ __u32 computed_cksum;
+#endif
+
+ /* we set the QP to erroring after we've finished disconnecting,
+ * maybe we should do so sooner. */
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
+ IBNAL_CONN_DISCONNECTED);
+
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ LASSERT (rx->rx_posted);
+ rx->rx_posted = 0;
+ mb();
+
+ /* receives complete with error in any case after we've started
+ * disconnecting */
+ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+ goto failed;
+
+ if (wc->Status != WRStatusSuccess) {
+ CERROR("Rx from "LPX64" failed: %d\n",
+ conn->ibc_peer->ibp_nid, wc->Status);
+ goto failed;
+ }
+
+ if (nob < base_nob) {
+ CERROR ("Short rx from "LPX64": %d < expected %d\n",
+ conn->ibc_peer->ibp_nid, nob, base_nob);
+ goto failed;
+ }
+
+ hexdump("rx", rx->rx_msg, sizeof(kib_msg_t));
+
+ /* Receiver does any byte flipping if necessary... */
+
+ if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+ flipped = 0;
+ } else {
+ if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+ CERROR ("Unrecognised magic: %08x from "LPX64"\n",
+ msg->ibm_magic, conn->ibc_peer->ibp_nid);
+ goto failed;
+ }
+ flipped = 1;
+ __swab16s (&msg->ibm_version);
+ LASSERT (sizeof(msg->ibm_type) == 1);
+ LASSERT (sizeof(msg->ibm_credits) == 1);
+ }
+
+ if (msg->ibm_version != IBNAL_MSG_VERSION) {
+ CERROR ("Incompatible msg version %d (%d expected)\n",
+ msg->ibm_version, IBNAL_MSG_VERSION);
+ goto failed;
+ }
+
+#if IBNAL_CKSUM
+ if (nob != msg->ibm_nob) {
+ CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
+ goto failed;
+ }
+
+ msg_cksum = le32_to_cpu(msg->ibm_cksum);
+ msg->ibm_cksum = 0;
+ computed_cksum = kibnal_cksum (msg, nob);
+
+ if (msg_cksum != computed_cksum) {
+ CERROR ("Checksum failure %d: (%d expected)\n",
+ computed_cksum, msg_cksum);
+// goto failed;
+ }
+ CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
+#endif
+
+ /* Have I received credits that will let me send? */
+ credits = msg->ibm_credits;
+ if (credits != 0) {
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+ conn->ibc_credits += credits;
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ kibnal_check_sends(conn);
+ }
+
+ switch (msg->ibm_type) {
+ case IBNAL_MSG_NOOP:
+ kibnal_post_rx (rx, 1);
+ return;
+
+ case IBNAL_MSG_IMMEDIATE:
+ if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
+ CERROR ("Short IMMEDIATE from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, nob);
+ goto failed;
+ }
+ break;
+
+ case IBNAL_MSG_PUT_RDMA:
+ case IBNAL_MSG_GET_RDMA:
+ if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
+ CERROR ("Short RDMA msg from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, nob);
+ goto failed;
+ }
+ if (flipped)
+ __swab32s(&msg->ibm_u.rdma.ibrm_num_descs);
+
+ CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
+ msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
+
+ if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) ||
+ (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) >
+ min(nob, IBNAL_MSG_SIZE))) {
+ CERROR ("num_descs %d too large\n",
+ msg->ibm_u.rdma.ibrm_num_descs);
+ goto failed;
+ }
+
+ for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
+ kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
+
+ if (flipped) {
+ __swab32s(&desc->rd_key);
+ __swab32s(&desc->rd_nob);
+ __swab64s(&desc->rd_addr);
+ }
+
+ CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n",
+ desc->rd_key, desc->rd_addr, desc->rd_nob);
+ }
+ break;
+
+ case IBNAL_MSG_PUT_DONE:
+ case IBNAL_MSG_GET_DONE:
+ if (nob < base_nob + sizeof (kib_completion_msg_t)) {
+ CERROR ("Short COMPLETION msg from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, nob);
+ goto failed;
+ }
+ if (flipped)
+ __swab32s(&msg->ibm_u.completion.ibcm_status);
+
+ CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
+ msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+ msg->ibm_u.completion.ibcm_status);
+
+ kibnal_complete_passive_rdma (conn,
+ msg->ibm_u.completion.ibcm_cookie,
+ msg->ibm_u.completion.ibcm_status);
+ kibnal_post_rx (rx, 1);
+ return;
+
+ default:
+ CERROR ("Can't parse type from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, msg->ibm_type);
+ goto failed;
+ }
+
+ /* schedule for kibnal_rx() in thread context */
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+
+ list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+ wake_up (&kibnal_data.kib_sched_waitq);
+
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+ return;
+
+ failed:
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ kibnal_close_conn(conn, -ECONNABORTED);
+
+ /* Don't re-post rx & drop its ref on conn */
+ kibnal_put_conn(conn);
+}
+
+void
+kibnal_rx (kib_rx_t *rx)
+{
+ kib_msg_t *msg = rx->rx_msg;
+
+ /* Clear flag so I can detect if I've sent an RDMA completion */
+ rx->rx_rdma = 0;
+
+ switch (msg->ibm_type) {
+ case IBNAL_MSG_GET_RDMA:
+ lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+ /* If the incoming get was matched, I'll have initiated the
+ * RDMA and the completion message... */
+ if (rx->rx_rdma)
+ break;
+
+ /* Otherwise, I'll send a failed completion now to prevent
+ * the peer's GET blocking for the full timeout. */
+ CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
+ rx->rx_conn->ibc_peer->ibp_nid);
+ kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+ rx, NULL, 0, NULL, NULL, 0, 0);
+ break;
+
+ case IBNAL_MSG_PUT_RDMA:
+ lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+ if (rx->rx_rdma)
+ break;
+ /* This is most unusual, since even if lib_parse() didn't
+ * match anything, it should have asked us to read (and
+ * discard) the payload. The portals header must be
+ * inconsistent with this message type, so it's the
+ * sender's fault for sending garbage and she can time
+ * herself out... */
+ CERROR ("Uncompleted RMDA PUT from "LPX64"\n",
+ rx->rx_conn->ibc_peer->ibp_nid);
+ break;
+
+ case IBNAL_MSG_IMMEDIATE:
+ lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
+ LASSERT (!rx->rx_rdma);
+ break;
+
+ default:
+ LBUG();
+ break;
+ }
+
+ kibnal_post_rx (rx, 1);
+}
+
+static struct page *
+kibnal_kvaddr_to_page (unsigned long vaddr)
+{
+ struct page *page;
+
+ if (vaddr >= VMALLOC_START &&
+ vaddr < VMALLOC_END)
+ page = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+ else if (vaddr >= PKMAP_BASE &&
+ vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+ page = vmalloc_to_page ((void *)vaddr);
+ /* in 2.4 ^ just walks the page tables */
+#endif
+ else
+ page = virt_to_page (vaddr);
+
+ if (!VALID_PAGE (page))
+ page = NULL;
+
+ return page;
+}
+
+static void
+kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
+ unsigned long len, int active)
+{
+ kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
+ kib_rdma_desc_t *desc;
+
+ LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n",
+ ibrm->ibrm_num_descs);
+
+ desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
+ if (active)
+ desc->rd_key = kibnal_data.kib_md.md_lkey;
+ else
+ desc->rd_key = kibnal_data.kib_md.md_rkey;
+ desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */
+ desc->rd_addr = kibnal_page2phys(page) + page_offset +
+ kibnal_data.kib_md.md_addr;
+
+ ibrm->ibrm_num_descs++;
+}
+
+static int
+kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
+{
+ struct page *page;
+ int page_offset, len;
+
+ while (nob > 0) {
+ page = kibnal_kvaddr_to_page(vaddr);
+ if (page == NULL)
+ return -EFAULT;
+
+ page_offset = vaddr & (PAGE_SIZE - 1);
+ len = min(nob, (int)PAGE_SIZE - page_offset);
+
+ kibnal_fill_ibrm(tx, page, page_offset, len, active);
+ nob -= len;
+ vaddr += len;
+ }
+ return 0;
+}
+
+static int
+kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+ int niov, struct iovec *iov, int offset, int nob, int active)
+
+{
+ void *vaddr;
+ FSTATUS frc;
+
+ LASSERT (nob > 0);
+ LASSERT (niov > 0);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ niov--;
+ iov++;
+ LASSERT (niov > 0);
+ }
+
+ if (nob > iov->iov_len - offset) {
+ CERROR ("Can't map multiple vaddr fragments\n");
+ return (-EMSGSIZE);
+ }
+
+ /* our large contiguous iov could be backed by multiple physical
+ * pages. */
+ if (kibnal_whole_mem()) {
+ int rc;
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+ rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base +
+ offset, nob, active);
+ if (rc != 0) {
+ CERROR ("Can't map iov: %d\n", rc);
+ return rc;
+ }
+ return 0;
+ }
+
+ vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
+ tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+
+ frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob,
+ kibnal_data.kib_pd, access,
+ &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
+ &tx->tx_md.md_rkey);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't map vaddr %p: %d\n", vaddr, frc);
+ return -EINVAL;
+ }
+
+ tx->tx_mapped = KIB_TX_MAPPED;
+ return (0);
+}
+
+static int
+kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+ int nkiov, ptl_kiov_t *kiov,
+ int offset, int nob, int active)
+{
+ __u64 *phys = NULL;
+ int page_offset;
+ int nphys;
+ int resid;
+ int phys_size = 0;
+ FSTATUS frc;
+ int i, rc = 0;
+
+ CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+ LASSERT (nob > 0);
+ LASSERT (nkiov > 0);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ nkiov--;
+ kiov++;
+ LASSERT (nkiov > 0);
+ }
+
+ page_offset = kiov->kiov_offset + offset;
+ nphys = 1;
+
+ if (!kibnal_whole_mem()) {
+ phys_size = nkiov * sizeof (*phys);
+ PORTAL_ALLOC(phys, phys_size);
+ if (phys == NULL) {
+ CERROR ("Can't allocate tmp phys\n");
+ return (-ENOMEM);
+ }
+
+ phys[0] = kibnal_page2phys(kiov->kiov_page);
+ } else {
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+ kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset,
+ kiov->kiov_len, active);
+ }
+
+ resid = nob - (kiov->kiov_len - offset);
+
+ while (resid > 0) {
+ kiov++;
+ nkiov--;
+ LASSERT (nkiov > 0);
+
+ if (kiov->kiov_offset != 0 ||
+ ((resid > PAGE_SIZE) &&
+ kiov->kiov_len < PAGE_SIZE)) {
+ /* Can't have gaps */
+ CERROR ("Can't make payload contiguous in I/O VM:"
+ "page %d, offset %d, len %d \n", nphys,
+ kiov->kiov_offset, kiov->kiov_len);
+
+ for (i = -nphys; i < nkiov; i++)
+ {
+ CERROR("kiov[%d] %p +%d for %d\n",
+ i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
+ }
+
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (nphys == PTL_MD_MAX_IOV) {
+ CERROR ("payload too big (%d)\n", nphys);
+ rc = -EMSGSIZE;
+ goto out;
+ }
+
+ if (!kibnal_whole_mem()) {
+ LASSERT (nphys * sizeof (*phys) < phys_size);
+ phys[nphys] = kibnal_page2phys(kiov->kiov_page);
+ } else {
+ if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
+ CERROR ("payload too big (%d)\n", nphys);
+ rc = -EMSGSIZE;
+ goto out;
+ }
+ kibnal_fill_ibrm(tx, kiov->kiov_page,
+ kiov->kiov_offset, kiov->kiov_len,
+ active);
+ }
+
+ nphys ++;
+ resid -= PAGE_SIZE;
+ }
+
+ if (kibnal_whole_mem())
+ goto out;
+
+#if 0
+ CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
+ for (i = 0; i < nphys; i++)
+ CWARN (" [%d] "LPX64"\n", i, phys[i]);
+#endif
+
+#if IBNAL_FMR
+#error "iibnal hasn't learned about FMR yet"
+ rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
+ phys, nphys,
+ &tx->tx_md.md_addr,
+ page_offset,
+ &tx->tx_md.md_handle.fmr,
+ &tx->tx_md.md_lkey,
+ &tx->tx_md.md_rkey);
+#else
+ frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+ IBNAL_RDMA_BASE,
+ phys, nphys,
+ 0, /* offset */
+ kibnal_data.kib_pd,
+ access,
+ &tx->tx_md.md_handle,
+ &tx->tx_md.md_addr,
+ &tx->tx_md.md_lkey,
+ &tx->tx_md.md_rkey);
+#endif
+ if (frc == FSUCCESS) {
+ CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
+ nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
+#if IBNAL_FMR
+ tx->tx_mapped = KIB_TX_MAPPED_FMR;
+#else
+ tx->tx_mapped = KIB_TX_MAPPED;
+#endif
+ } else {
+ CERROR ("Can't map phys: %d\n", rc);
+ rc = -EFAULT;
+ }
+
+ out:
+ if (phys != NULL)
+ PORTAL_FREE(phys, phys_size);
+ return (rc);
+}
+
+static kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
+{
+ struct list_head *tmp;
+
+ /* just return the first connection */
+ list_for_each (tmp, &peer->ibp_conns) {
+ return (list_entry(tmp, kib_conn_t, ibc_list));
+ }
+
+ return (NULL);
+}
+
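+/* Credit-based flow control, as implemented below and in the rx path: each
+ * send consumes one of ibc_credits (granted by the peer); credits owed back
+ * to the peer accumulate in ibc_outstanding_credits and are piggy-backed on
+ * the next outgoing message's ibm_credits.  When nothing is queued and the
+ * owed credits reach IBNAL_CREDIT_HIGHWATER, a NOOP is sent purely to return
+ * them, and the last send credit is held in reserve so such a message can
+ * always go out. */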
+void
+kibnal_check_sends (kib_conn_t *conn)
+{
+ unsigned long flags;
+ kib_tx_t *tx;
+ int rc;
+ int i;
+ int done;
+ int nwork;
+ ENTRY;
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
+ if (list_empty(&conn->ibc_tx_queue) &&
+ conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ tx = kibnal_get_idle_tx(0); /* don't block */
+ if (tx != NULL)
+ kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
+
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+
+ if (tx != NULL) {
+ atomic_inc(&conn->ibc_refcount);
+ kibnal_queue_tx_locked(tx, conn);
+ }
+ }
+
+ while (!list_empty (&conn->ibc_tx_queue)) {
+ tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
+
+ /* We rely on this for QP sizing */
+ LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+
+ LASSERT (conn->ibc_outstanding_credits >= 0);
+ LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
+ LASSERT (conn->ibc_credits >= 0);
+ LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
+
+ /* Not yet awaiting passive RDMA completion (not on ibc_active_txs) */
+ LASSERT (!tx->tx_passive_rdma_wait);
+
+ if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
+ GOTO(out, 0);
+
+ if (conn->ibc_credits == 0) /* no credits */
+ GOTO(out, 1);
+
+ if (conn->ibc_credits == 1 && /* last credit reserved for */
+ conn->ibc_outstanding_credits == 0) /* giving back credits */
+ GOTO(out, 2);
+
+ list_del (&tx->tx_list);
+
+ if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
+ (!list_empty(&conn->ibc_tx_queue) ||
+ conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+ /* redundant NOOP */
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+ kibnal_tx_done(tx);
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+ continue;
+ }
+
+ tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
+ conn->ibc_outstanding_credits = 0;
+
+ conn->ibc_nsends_posted++;
+ conn->ibc_credits--;
+
+ /* we only get a tx completion for the final rdma op */
+ tx->tx_sending = min(tx->tx_nsp, 2);
+ tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+ list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+ tx->tx_msg->ibm_cksum = 0;
+ tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+ CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
+#endif
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ /* NB the gap between removing tx from the queue and sending it
+ * allows message re-ordering to occur */
+
+ LASSERT (tx->tx_nsp > 0);
+
+ rc = -ECONNABORTED;
+ nwork = 0;
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ tx->tx_status = 0;
+ /* Driver only accepts 1 item at a time */
+ for (i = 0; i < tx->tx_nsp; i++) {
+ hexdump("tx", tx->tx_msg, sizeof(kib_msg_t));
+ rc = iibt_postsend(conn->ibc_qp,
+ &tx->tx_wrq[i]);
+ if (rc != 0)
+ break;
+ if (wrq_signals_completion(&tx->tx_wrq[i]))
+ nwork++;
+ CDEBUG(D_NET, "posted tx wrq %p\n",
+ &tx->tx_wrq[i]);
+ }
+ }
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+ if (rc != 0) {
+ /* NB credits are transferred in the actual
+ * message, which can only be the last work item */
+ conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
+ conn->ibc_credits++;
+ conn->ibc_nsends_posted--;
+
+ tx->tx_status = rc;
+ tx->tx_passive_rdma_wait = 0;
+ tx->tx_sending -= tx->tx_nsp - nwork;
+
+ done = (tx->tx_sending == 0);
+ if (done)
+ list_del (&tx->tx_list);
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+ CERROR ("Error %d posting transmit to "LPX64"\n",
+ rc, conn->ibc_peer->ibp_nid);
+ else
+ CDEBUG (D_NET, "Error %d posting transmit to "
+ LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+
+ kibnal_close_conn (conn, rc);
+
+ if (done)
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ }
+
+ EXIT;
+out:
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+}
+
+static void
+kibnal_tx_callback (IB_WORK_COMPLETION *wc)
+{
+ kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+ kib_conn_t *conn;
+ unsigned long flags;
+ int idle;
+
+ conn = tx->tx_conn;
+ LASSERT (conn != NULL);
+ LASSERT (tx->tx_sending != 0);
+
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+
+ CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
+ tx->tx_sending, tx->tx_nsp, wc->Status);
+
+ /* I could be racing with rdma completion. Whoever makes 'tx' idle
+ * gets to free it, which also drops its ref on 'conn'. If it's
+ * not me, then I take an extra ref on conn so it can't disappear
+ * under me. */
+
+ tx->tx_sending--;
+ idle = (tx->tx_sending == 0) && /* This is the final callback */
+ (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */
+ if (idle)
+ list_del(&tx->tx_list);
+
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+
+ if (tx->tx_sending == 0)
+ conn->ibc_nsends_posted--;
+
+ if (wc->Status != WRStatusSuccess &&
+ tx->tx_status == 0)
+ tx->tx_status = -ECONNABORTED;
+
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ if (idle)
+ kibnal_tx_done (tx);
+
+ if (wc->Status != WRStatusSuccess) {
+ CERROR ("Tx completion to "LPX64" failed: %d\n",
+ conn->ibc_peer->ibp_nid, wc->Status);
+ kibnal_close_conn (conn, -ENETDOWN);
+ } else {
+ /* can I shovel some more sends out the door? */
+ kibnal_check_sends(conn);
+ }
+
+ kibnal_put_conn (conn);
+}
+
+void
+kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev)
+{
+ /* XXX flesh out. this seems largely for async errors */
+ CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
+}
+
+void
+kibnal_ca_callback (void *ca_arg, void *cq_arg)
+{
+ IB_HANDLE cq = *(IB_HANDLE *)cq_arg;
+ IB_HANDLE ca = *(IB_HANDLE *)ca_arg;
+ IB_WORK_COMPLETION wc;
+ int armed = 0;
+
+ CDEBUG(D_NET, "ca %p cq %p\n", ca, cq);
+
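+        /* Drain the CQ, rearm it, then drain once more: a completion that
+         * arrives between the final poll and the rearm would otherwise be
+         * missed, since rearming only fires the callback for completions
+         * that arrive afterwards. */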
+ for(;;) {
+ while (iibt_cq_poll(cq, &wc) == FSUCCESS) {
+ if (kibnal_wreqid_is_rx(wc.WorkReqId))
+ kibnal_rx_callback(&wc);
+ else
+ kibnal_tx_callback(&wc);
+ }
+ if (armed)
+ return;
+ if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) {
+ CERROR("rearm failed?\n");
+ return;
+ }
+ armed = 1;
+ }
+}
+
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
+{
+ IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp];
+ IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nsp];
+ int fence;
+ int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+
+ LASSERT (tx->tx_nsp >= 0 &&
+ tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+ LASSERT (nob <= IBNAL_MSG_SIZE);
+
+ tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+ tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+ tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+ tx->tx_msg->ibm_nob = nob;
+#endif
+ /* Fence the message if it's bundled with an RDMA read */
+ fence = (tx->tx_nsp > 0) &&
+ (type == IBNAL_MSG_PUT_DONE);
+
+ *gl = (IB_LOCAL_DATASEGMENT) {
+ .Address = tx->tx_vaddr,
+ .Length = IBNAL_MSG_SIZE,
+ .Lkey = kibnal_lkey(kibnal_data.kib_tx_pages),
+ };
+
+ wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0);
+ wrq->Operation = WROpSend;
+ wrq->DSList = gl;
+ wrq->DSListDepth = 1;
+ wrq->MessageLen = nob;
+ wrq->Req.SendRC.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.SolicitedEvent = 1;
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+ wrq->Req.SendRC.Options.s.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.Fence = fence;
+
+ tx->tx_nsp++;
+}
+
+static void
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+
+ kibnal_queue_tx_locked (tx, conn);
+
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ kibnal_check_sends(conn);
+}
+
+static void
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
+{
+ unsigned long flags;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ rwlock_t *g_lock = &kibnal_data.kib_global_lock;
+
+ /* If I get here, I've committed to send, so I complete the tx with
+ * failure on any problems */
+
+ LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
+ LASSERT (tx->tx_nsp > 0); /* work items have been set up */
+
+ read_lock (g_lock);
+
+ peer = kibnal_find_peer_locked (nid);
+ if (peer == NULL) {
+ read_unlock (g_lock);
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+ read_unlock (g_lock);
+
+ kibnal_queue_tx (tx, conn);
+ return;
+ }
+
+ /* Making one or more connections; I'll need a write lock... */
+ read_unlock (g_lock);
+ write_lock_irqsave (g_lock, flags);
+
+ peer = kibnal_find_peer_locked (nid);
+ if (peer == NULL) {
+ write_unlock_irqrestore (g_lock, flags);
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ /* Connection exists; queue message on it */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+ write_unlock_irqrestore (g_lock, flags);
+
+ kibnal_queue_tx (tx, conn);
+ return;
+ }
+
+ if (peer->ibp_connecting == 0) {
+ if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
+ write_unlock_irqrestore (g_lock, flags);
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ peer->ibp_connecting = 1;
+ kib_peer_addref(peer); /* extra ref for connd */
+
+ spin_lock (&kibnal_data.kib_connd_lock);
+
+ list_add_tail (&peer->ibp_connd_list,
+ &kibnal_data.kib_connd_peers);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock (&kibnal_data.kib_connd_lock);
+ }
+
+ /* A connection is being established; queue the message... */
+ list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
+
+ write_unlock_irqrestore (g_lock, flags);
+}
+
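+/* "Passive" RDMA (below): this side registers its buffer and sends a
+ * PUT_RDMA/GET_RDMA message carrying the rkey, address and a cookie; the
+ * peer drives the actual RDMA (see kibnal_start_active_rdma()) and returns
+ * a PUT_DONE/GET_DONE carrying the cookie and status, which is matched in
+ * kibnal_complete_passive_rdma(). */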
+static ptl_err_t
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
+ lib_msg_t *libmsg, ptl_hdr_t *hdr)
+{
+ int nob = libmsg->md->length;
+ kib_tx_t *tx;
+ kib_msg_t *ibmsg;
+ int rc;
+ IB_ACCESS_CONTROL access = {0,};
+
+ LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
+ LASSERT (nob > 0);
+ LASSERT (!in_interrupt()); /* Mapping could block */
+
+ access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaRead = 1;
+ access.s.RdmaWrite = 1;
+
+ tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */
+ LASSERT (tx != NULL);
+
+ if ((libmsg->md->options & PTL_MD_KIOV) == 0)
+ rc = kibnal_map_iov (tx, access,
+ libmsg->md->md_niov,
+ libmsg->md->md_iov.iov,
+ 0, nob, 0);
+ else
+ rc = kibnal_map_kiov (tx, access,
+ libmsg->md->md_niov,
+ libmsg->md->md_iov.kiov,
+ 0, nob, 0);
+
+ if (rc != 0) {
+ CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
+ goto failed;
+ }
+
+ if (type == IBNAL_MSG_GET_RDMA) {
+ /* reply gets finalized when tx completes */
+ tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib,
+ nid, libmsg);
+ if (tx->tx_libmsg[1] == NULL) {
+ CERROR ("Can't create reply for GET -> "LPX64"\n",
+ nid);
+ rc = -ENOMEM;
+ goto failed;
+ }
+ }
+
+ tx->tx_passive_rdma = 1;
+
+ ibmsg = tx->tx_msg;
+
+ ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+ ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+ /* map_kiov already filled the rdma descs for the whole_mem case */
+ if (!kibnal_whole_mem()) {
+ ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey;
+ ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+ ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+ ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
+ }
+
+ kibnal_init_tx_msg (tx, type,
+ kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+
+ CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
+ LPX64", nob %d\n",
+ tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
+ tx->tx_md.md_addr, nob);
+
+ /* libmsg gets finalized when tx completes. */
+ tx->tx_libmsg[0] = libmsg;
+
+ kibnal_launch_tx(tx, nid);
+ return (PTL_OK);
+
+ failed:
+ tx->tx_status = rc;
+ kibnal_tx_done (tx);
+ return (PTL_FAIL);
+}
+
+void
+kibnal_start_active_rdma (int type, int status,
+ kib_rx_t *rx, lib_msg_t *libmsg,
+ unsigned int niov,
+ struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t nob)
+{
+ kib_msg_t *rxmsg = rx->rx_msg;
+ kib_msg_t *txmsg;
+ kib_tx_t *tx;
+ IB_ACCESS_CONTROL access = {0,};
+ IB_WR_OP rdma_op;
+ int rc;
+ __u32 i;
+
+ CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
+ type, status, niov, offset, nob);
+
+ /* Called by scheduler */
+ LASSERT (!in_interrupt ());
+
+ /* Either all pages or all vaddrs */
+ LASSERT (!(kiov != NULL && iov != NULL));
+
+ /* No data if we're completing with failure */
+ LASSERT (status == 0 || nob == 0);
+
+ LASSERT (type == IBNAL_MSG_GET_DONE ||
+ type == IBNAL_MSG_PUT_DONE);
+
+ /* Flag I'm completing the RDMA. Even if I fail to send the
+ * completion message, I will have tried my best so further
+ * attempts shouldn't be tried. */
+ LASSERT (!rx->rx_rdma);
+ rx->rx_rdma = 1;
+
+ if (type == IBNAL_MSG_GET_DONE) {
+ rdma_op = WROpRdmaWrite;
+ LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
+ } else {
+ access.s.LocalWrite = 1;
+ rdma_op = WROpRdmaRead;
+ LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
+ }
+
+ tx = kibnal_get_idle_tx (0); /* Mustn't block */
+ if (tx == NULL) {
+ CERROR ("tx descs exhausted on RDMA from "LPX64
+ " completing locally with failure\n",
+ rx->rx_conn->ibc_peer->ibp_nid);
+ lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+ return;
+ }
+ LASSERT (tx->tx_nsp == 0);
+
+ if (nob == 0)
+ GOTO(init_tx, 0);
+
+ /* We actually need to transfer some data (the transfer
+ * size could get truncated to zero when the incoming
+ * message is matched) */
+ if (kiov != NULL)
+ rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
+ else
+ rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
+
+ if (rc != 0) {
+ CERROR ("Can't map RDMA -> "LPX64": %d\n",
+ rx->rx_conn->ibc_peer->ibp_nid, rc);
+ /* We'll skip the RDMA and complete with failure. */
+ status = rc;
+ nob = 0;
+ GOTO(init_tx, rc);
+ }
+
+ if (!kibnal_whole_mem()) {
+ tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey;
+ tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+ tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
+ }
+
+ /* XXX ugh. different page-sized hosts. */
+ if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
+ rxmsg->ibm_u.rdma.ibrm_num_descs) {
+ CERROR("tx descs (%u) != rx descs (%u)\n",
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
+ rxmsg->ibm_u.rdma.ibrm_num_descs);
+ /* We'll skip the RDMA and complete with failure. */
+ status = -EINVAL;
+ nob = 0;
+ GOTO(init_tx, rc);
+ }
+
+ /* map_kiov filled in the rdma descs which describe our side of the
+ * rdma transfer. */
+ /* ibrm_num_descs was verified in rx_callback */
+ for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
+ kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
+ IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i];
+ IB_WORK_REQ *wrq = &tx->tx_wrq[i];
+
+ ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
+ rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
+
+ ds->Address = ldesc->rd_addr;
+ ds->Length = ldesc->rd_nob;
+ ds->Lkey = ldesc->rd_key;
+
+ memset(wrq, 0, sizeof(*wrq));
+ wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0);
+ wrq->Operation = rdma_op;
+ wrq->DSList = ds;
+ wrq->DSListDepth = 1;
+ wrq->MessageLen = ds->Length;
+ wrq->Req.SendRC.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.SolicitedEvent = 0;
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 0;
+ wrq->Req.SendRC.Options.s.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.Fence = 0;
+ wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr;
+ wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key;
+
+ /* only the last rdma post triggers tx completion */
+ if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+
+ tx->tx_nsp++;
+ }
+
+init_tx:
+ txmsg = tx->tx_msg;
+
+ txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+ txmsg->ibm_u.completion.ibcm_status = status;
+
+ kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+
+ if (status == 0 && nob != 0) {
+ LASSERT (tx->tx_nsp > 1);
+ /* RDMA: libmsg gets finalized when the tx completes. This
+ * is after the completion message has been sent, which in
+ * turn is after the RDMA has finished. */
+ tx->tx_libmsg[0] = libmsg;
+ } else {
+ LASSERT (tx->tx_nsp == 1);
+ /* No RDMA: local completion happens now! */
+ CDEBUG(D_WARNING,"No data: immediate completion\n");
+ lib_finalize (&kibnal_lib, NULL, libmsg,
+ status == 0 ? PTL_OK : PTL_FAIL);
+ }
+
+ /* +1 ref for this tx... */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ rx->rx_conn, rx->rx_conn->ibc_state,
+ rx->rx_conn->ibc_peer->ibp_nid,
+ atomic_read (&rx->rx_conn->ibc_refcount));
+ atomic_inc (&rx->rx_conn->ibc_refcount);
+ /* ...and queue it up */
+ kibnal_queue_tx(tx, rx->rx_conn);
+}
+
+static ptl_err_t
+kibnal_sendmsg(lib_nal_t *nal,
+ void *private,
+ lib_msg_t *libmsg,
+ ptl_hdr_t *hdr,
+ int type,
+ ptl_nid_t nid,
+ ptl_pid_t pid,
+ unsigned int payload_niov,
+ struct iovec *payload_iov,
+ ptl_kiov_t *payload_kiov,
+ size_t payload_offset,
+ size_t payload_nob)
+{
+ kib_msg_t *ibmsg;
+ kib_tx_t *tx;
+ int nob;
+
+ /* NB 'private' is different depending on what we're sending.... */
+
+ CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
+ " pid %d\n", payload_nob, payload_niov, nid , pid);
+
+ LASSERT (payload_nob == 0 || payload_niov > 0);
+ LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+ /* Thread context if we're sending payload */
+ LASSERT (!in_interrupt() || payload_niov == 0);
+ /* payload is either all vaddrs or all pages */
+ LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+ switch (type) {
+ default:
+ LBUG();
+ return (PTL_FAIL);
+
+ case PTL_MSG_REPLY: {
+ /* reply's 'private' is the incoming receive */
+ kib_rx_t *rx = private;
+
+ /* RDMA reply expected? */
+ if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+ kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+ rx, libmsg, payload_niov,
+ payload_iov, payload_kiov,
+ payload_offset, payload_nob);
+ return (PTL_OK);
+ }
+
+ /* Incoming message consistent with immediate reply? */
+ if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
+ CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
+ nid, rx->rx_msg->ibm_type);
+ return (PTL_FAIL);
+ }
+
+ /* Will it fit in a message? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob >= IBNAL_MSG_SIZE) {
+ CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n",
+ nid, payload_nob);
+ return (PTL_FAIL);
+ }
+ break;
+ }
+
+ case PTL_MSG_GET:
+ /* might the REPLY message be big enough to need RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+ if (nob > IBNAL_MSG_SIZE)
+ return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA,
+ nid, libmsg, hdr));
+ break;
+
+ case PTL_MSG_ACK:
+ LASSERT (payload_nob == 0);
+ break;
+
+ case PTL_MSG_PUT:
+ /* Is the payload big enough to need RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob > IBNAL_MSG_SIZE)
+ return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+ nid, libmsg, hdr));
+
+ break;
+ }
+
+ tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+ type == PTL_MSG_REPLY ||
+ in_interrupt()));
+ if (tx == NULL) {
+ CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n",
+ type, nid, in_interrupt() ? " (intr)" : "");
+ return (PTL_NO_SPACE);
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+ if (payload_nob > 0) {
+ if (payload_kiov != NULL)
+ lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ else
+ lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ }
+
+ kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+ offsetof(kib_immediate_msg_t,
+ ibim_payload[payload_nob]));
+
+ /* libmsg gets finalized when tx completes */
+ tx->tx_libmsg[0] = libmsg;
+
+ kibnal_launch_tx(tx, nid);
+ return (PTL_OK);
+}
+
+static ptl_err_t
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int payload_niov, struct iovec *payload_iov,
+ size_t payload_offset, size_t payload_len)
+{
+ return (kibnal_sendmsg(nal, private, cookie,
+ hdr, type, nid, pid,
+ payload_niov, payload_iov, NULL,
+ payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int payload_niov, ptl_kiov_t *payload_kiov,
+ size_t payload_offset, size_t payload_len)
+{
+ return (kibnal_sendmsg(nal, private, cookie,
+ hdr, type, nid, pid,
+ payload_niov, NULL, payload_kiov,
+ payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+ unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t mlen, size_t rlen)
+{
+ kib_rx_t *rx = private;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ int msg_nob;
+
+ LASSERT (mlen <= rlen);
+ LASSERT (!in_interrupt ());
+ /* Either all pages or all vaddrs */
+ LASSERT (!(kiov != NULL && iov != NULL));
+
+ switch (rxmsg->ibm_type) {
+ default:
+ LBUG();
+ return (PTL_FAIL);
+
+ case IBNAL_MSG_IMMEDIATE:
+ msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+ if (msg_nob > IBNAL_MSG_SIZE) {
+ CERROR ("Immediate message from "LPX64" too big: %d\n",
+ rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
+ return (PTL_FAIL);
+ }
+
+ if (kiov != NULL)
+ lib_copy_buf2kiov(niov, kiov, offset,
+ rxmsg->ibm_u.immediate.ibim_payload,
+ mlen);
+ else
+ lib_copy_buf2iov(niov, iov, offset,
+ rxmsg->ibm_u.immediate.ibim_payload,
+ mlen);
+
+ lib_finalize (nal, NULL, libmsg, PTL_OK);
+ return (PTL_OK);
+
+ case IBNAL_MSG_GET_RDMA:
+ /* We get called here just to discard any junk after the
+ * GET hdr. */
+ LASSERT (libmsg == NULL);
+ lib_finalize (nal, NULL, libmsg, PTL_OK);
+ return (PTL_OK);
+
+ case IBNAL_MSG_PUT_RDMA:
+ kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+ rx, libmsg,
+ niov, iov, kiov, offset, mlen);
+ return (PTL_OK);
+ }
+}
+
+static ptl_err_t
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+ unsigned int niov, struct iovec *iov,
+ size_t offset, size_t mlen, size_t rlen)
+{
+ return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+ offset, mlen, rlen));
+}
+
+static ptl_err_t
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+ unsigned int niov, ptl_kiov_t *kiov,
+ size_t offset, size_t mlen, size_t rlen)
+{
+ return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+ offset, mlen, rlen));
+}
+
+/*****************************************************************************
+ * the rest of this file concerns connection management. active connections
+ * start with connect_peer, passive connections start with passive_callback.
+ * active disconnects start with conn_close, cm_callback starts passive
+ * disconnects and contains the guts of how the disconnect state machine
+ * progresses.
+ *****************************************************************************/
+
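+/* spawn a kernel thread and bump the thread count; kibnal_thread_fini()
+ * drops it again when the thread exits */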
+int
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+ long pid = kernel_thread (fn, arg, 0);
+
+ if (pid < 0)
+ return ((int)pid);
+
+ atomic_inc (&kibnal_data.kib_nthreads);
+ return (0);
+}
+
+static void
+kibnal_thread_fini (void)
+{
+ atomic_dec (&kibnal_data.kib_nthreads);
+}
+
+/* This can be called by anyone at any time to close a connection. If
+ * the connection is still established it heads to the connd to start
+ * the disconnection in a safe context. It has no effect if called
+ * on a connection that is already disconnecting */
+void
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
+{
+ /* This just does the immediate housekeeping, and schedules the
+ * connection for the connd to finish off.
+ * Caller holds kib_global_lock exclusively in irq context */
+ kib_peer_t *peer = conn->ibc_peer;
+
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
+ IBNAL_CONN_DISCONNECTED);
+
+ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+ return; /* already disconnecting */
+
+ CDEBUG (error == 0 ? D_NET : D_ERROR,
+ "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
+
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ /* kib_connd_conns takes ibc_list's ref */
+ list_del (&conn->ibc_list);
+ } else {
+ /* new ref for kib_connd_conns */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ }
+
+ if (list_empty (&peer->ibp_conns) &&
+ peer->ibp_persistence == 0) {
+ /* Non-persistent peer with no more conns... */
+ kibnal_unlink_peer_locked (peer);
+ }
+
+ conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+
+ spin_lock (&kibnal_data.kib_connd_lock);
+
+ list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock (&kibnal_data.kib_connd_lock);
+}
+
+void
+kibnal_close_conn (kib_conn_t *conn, int error)
+{
+ unsigned long flags;
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ kibnal_close_conn_locked (conn, error);
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+}
+
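+/* a connection attempt to this peer failed: back off the reconnect
+ * interval and, if no other connection exists, complete any queued
+ * transmits with -EHOSTUNREACH and unlink a non-persistent peer */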
+static void
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
+{
+ LIST_HEAD (zombies);
+ kib_tx_t *tx;
+ unsigned long flags;
+
+ LASSERT (rc != 0);
+ LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ LASSERT (peer->ibp_connecting != 0);
+ peer->ibp_connecting--;
+
+ if (peer->ibp_connecting != 0) {
+ /* another connection attempt under way (loopback?)... */
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ return;
+ }
+
+ if (list_empty(&peer->ibp_conns)) {
+ /* Say when active connection can be re-attempted */
+ peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
+ /* Increase reconnection interval */
+ peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
+ IBNAL_MAX_RECONNECT_INTERVAL);
+
+ /* Take the peer's blocked transmits; I'll complete
+ * them with error */
+ while (!list_empty (&peer->ibp_tx_queue)) {
+ tx = list_entry (peer->ibp_tx_queue.next,
+ kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ list_add_tail (&tx->tx_list, &zombies);
+ }
+
+ if (kibnal_peer_active(peer) &&
+ (peer->ibp_persistence == 0)) {
+ /* failed connection attempt on non-persistent peer */
+ kibnal_unlink_peer_locked (peer);
+ }
+ } else {
+ /* Can't have blocked transmits if there are connections */
+ LASSERT (list_empty(&peer->ibp_tx_queue));
+ }
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ if (!list_empty (&zombies))
+ CERROR ("Deleting messages for "LPX64": connection failed\n",
+ peer->ibp_nid);
+
+ while (!list_empty (&zombies)) {
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ /* complete now */
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ }
+}
+
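+/* complete an active or passive connection attempt. On success the conn
+ * joins the peer's list, blocked transmits are posted on it and the
+ * receive buffers are pre-posted; on failure the conn is scheduled for
+ * cleanup and the peer is notified via kibnal_peer_connect_failed() */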
+static void
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
+{
+ int state = conn->ibc_state;
+ kib_peer_t *peer = conn->ibc_peer;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int i;
+
+ /* passive connection has no connreq & vice versa */
+ LASSERTF(!active == !(conn->ibc_connreq != NULL),
+ "%d %p\n", active, conn->ibc_connreq);
+ if (active) {
+ PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+ conn->ibc_connreq = NULL;
+ }
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ LASSERT (peer->ibp_connecting != 0);
+
+ if (status == 0) {
+ /* connection established... */
+ KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
+ conn->ibc_state = IBNAL_CONN_ESTABLISHED;
+
+ if (!kibnal_peer_active(peer)) {
+ /* ...but peer deleted meantime */
+ status = -ECONNABORTED;
+ }
+ } else {
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
+ IBNAL_CONN_CONNECTING);
+ }
+
+ if (status == 0) {
+ /* Everything worked! */
+
+ peer->ibp_connecting--;
+
+ /* +1 ref for ibc_list; caller(== CM)'s ref remains until
+ * the IB_CM_IDLE callback */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ list_add (&conn->ibc_list, &peer->ibp_conns);
+
+ /* reset reconnect interval for next attempt */
+ peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+ /* post blocked sends to the new connection */
+ spin_lock (&conn->ibc_lock);
+
+ while (!list_empty (&peer->ibp_tx_queue)) {
+ tx = list_entry (peer->ibp_tx_queue.next,
+ kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+
+ /* +1 ref for each tx */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ kibnal_queue_tx_locked (tx, conn);
+ }
+
+ spin_unlock (&conn->ibc_lock);
+
+ /* Nuke any dangling conns from a different peer instance... */
+ kibnal_close_stale_conns_locked (conn->ibc_peer,
+ conn->ibc_incarnation);
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ /* queue up all the receives */
+ for (i = 0; i < IBNAL_RX_MSGS; i++) {
+ /* +1 ref for rx desc */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+
+ CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
+ i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
+ conn->ibc_rxs[i].rx_vaddr);
+
+ kibnal_post_rx (&conn->ibc_rxs[i], 0);
+ }
+
+ kibnal_check_sends (conn);
+ return;
+ }
+
+ /* connection failed */
+ if (state == IBNAL_CONN_CONNECTING) {
+ /* schedule for connd to close */
+ kibnal_close_conn_locked (conn, status);
+ } else {
+ /* Don't have a CM comm_id; just wait for refs to drain */
+ conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+ }
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+
+ /* If we didn't establish the connection we don't have to pass
+ * through the disconnect protocol before dropping the CM ref */
+ if (state < IBNAL_CONN_CONNECTING)
+ kibnal_put_conn (conn);
+}
+
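+/* passive connection setup: validate the requested queue depth, find or
+ * create the peer for 'nid' and bind a new conn to it in the CONNECTING
+ * state; the caller completes the CM accept */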
+static int
+kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
+ ptl_nid_t nid, __u64 incarnation, int queue_depth)
+{
+ kib_conn_t *conn = kibnal_create_conn();
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+ unsigned long flags;
+
+ if (conn == NULL)
+ return (-ENOMEM);
+
+ if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+ CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
+ nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
+ atomic_dec (&conn->ibc_refcount);
+ kibnal_destroy_conn(conn);
+ return (-EPROTO);
+ }
+
+ /* assume 'nid' is a new peer */
+ peer = kibnal_create_peer (nid);
+ if (peer == NULL) {
+ CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_dec (&conn->ibc_refcount);
+ kibnal_destroy_conn(conn);
+ return (-ENOMEM);
+ }
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ peer2 = kibnal_find_peer_locked(nid);
+ if (peer2 == NULL) {
+ /* peer table takes my ref on peer */
+ list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
+ } else {
+ kib_peer_decref (peer);
+ peer = peer2;
+ }
+
+ kib_peer_addref(peer); /* +1 ref for conn */
+ peer->ibp_connecting++;
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ conn->ibc_peer = peer;
+ conn->ibc_state = IBNAL_CONN_CONNECTING;
+ /* conn->ibc_cep is set when cm_accept is called */
+ conn->ibc_incarnation = incarnation;
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+ *connp = conn;
+ return (0);
+}
+
+static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state)
+{
+ IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,};
+ FSTATUS frc;
+
+ modify_attr.RequestState = state;
+
+ frc = iibt_qp_modify(qp, &modify_attr, NULL);
+ if (frc != FSUCCESS)
+ CERROR("couldn't set qp state to %d, error %d\n", state, frc);
+}
+
+static void kibnal_flush_pending(kib_conn_t *conn)
+{
+ LIST_HEAD (zombies);
+ struct list_head *tmp;
+ struct list_head *nxt;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int done;
+
+ /* NB we wait until the connection has closed before completing
+ * outstanding passive RDMAs so we can be sure the network can't
+ * touch the mapped memory any more. */
+ KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
+
+ /* set the QP to the error state so that we get flush callbacks
+ * on our posted receives which can then drop their conn refs */
+ kibnal_set_qp_state(conn->ibc_qp, QPStateError);
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ /* grab passive RDMAs not waiting for the tx callback */
+ list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ /* still waiting for tx callback? */
+ if (!tx->tx_passive_rdma_wait)
+ continue;
+
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_passive_rdma_wait = 0;
+ done = (tx->tx_sending == 0);
+
+ if (!done)
+ continue;
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ /* grab all blocked transmits */
+ list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ while (!list_empty(&zombies)) {
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+ list_del(&tx->tx_list);
+ kibnal_tx_done (tx);
+ }
+}
+
+static void
+kibnal_reject (IB_HANDLE cep, uint16_t reason)
+{
+ CM_REJECT_INFO *rej;
+
+ PORTAL_ALLOC(rej, sizeof(*rej));
+ if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */
+ return;
+
+ rej->Reason = reason;
+ iibt_cm_reject(cep, rej);
+ PORTAL_FREE(rej, sizeof(*rej));
+}
+
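+/* take the QP through RTR and then RTS with two iibt_qp_modify() calls,
+ * using the negotiated path, PSNs and RDMA depths */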
+static FSTATUS
+kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res,
+ IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn)
+{
+ IB_QP_ATTRIBUTES_MODIFY modify_attr;
+ FSTATUS frc;
+ ENTRY;
+
+ modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateReadyToRecv,
+ .RecvPSN = IBNAL_STARTING_PSN,
+ .DestQPNumber = qpn,
+ .ResponderResources = resp_res,
+ .MinRnrTimer = UsecToRnrNakTimer(2000), /* 2 ms */
+ .Attrs = (IB_QP_ATTR_RECVPSN |
+ IB_QP_ATTR_DESTQPNUMBER |
+ IB_QP_ATTR_RESPONDERRESOURCES |
+ IB_QP_ATTR_DESTAV |
+ IB_QP_ATTR_PATHMTU |
+ IB_QP_ATTR_MINRNRTIMER),
+ };
+ GetAVFromPath(0, path, &modify_attr.PathMTU, NULL,
+ &modify_attr.DestAV);
+
+ frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+ if (frc != FSUCCESS)
+ RETURN(frc);
+
+ modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateReadyToSend,
+ .FlowControl = TRUE,
+ .InitiatorDepth = init_depth,
+ .SendPSN = send_psn,
+ .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */
+ .RetryCount = IBNAL_RETRY,
+ .RnrRetryCount = IBNAL_RNR_RETRY,
+ .Attrs = (IB_QP_ATTR_FLOWCONTROL |
+ IB_QP_ATTR_INITIATORDEPTH |
+ IB_QP_ATTR_SENDPSN |
+ IB_QP_ATTR_LOCALACKTIMEOUT |
+ IB_QP_ATTR_RETRYCOUNT |
+ IB_QP_ATTR_RNRRETRYCOUNT),
+ };
+
+ frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+ RETURN(frc);
+}
+
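+/* active connect: the peer replied to our CM request. Validate the wire
+ * connreq carried in the reply's private data, move the QP to RTS and
+ * call accept to complete the handshake */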
+static void
+kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+ IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+ kib_conn_t *conn = arg;
+ kib_wire_connreq_t *wcr;
+ CM_REPLY_INFO *rep = &info->Info.Reply;
+ uint16_t reason;
+ FSTATUS frc;
+
+ wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
+
+ if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+ CERROR ("Can't connect "LPX64": bad magic %08x\n",
+ conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+ CERROR ("Can't connect "LPX64": bad version %d\n",
+ conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
+ CERROR ("Can't connect "LPX64": bad queue depth %d\n",
+ conn->ibc_peer->ibp_nid,
+ le16_to_cpu(wcr->wcr_queue_depth));
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
+ CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
+ le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+ conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+ frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN,
+ min_t(__u8, rep->ArbInitiatorDepth,
+ ca_attr->MaxQPResponderResources),
+ &conn->ibc_connreq->cr_path,
+ min_t(__u8, rep->ArbResponderResources,
+ ca_attr->MaxQPInitiatorDepth),
+ rep->StartingPSN);
+ if (frc != FSUCCESS) {
+ CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
+ conn, conn->ibc_peer->ibp_nid, frc);
+ GOTO(reject, reason = RC_NO_QP);
+ }
+
+ /* the callback arguments are ignored for an active accept */
+ conn->ibc_connreq->cr_discarded.Status = FSUCCESS;
+ frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded,
+ NULL, NULL, NULL, NULL);
+ if (frc != FCM_CONNECT_ESTABLISHED) {
+ CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n",
+ conn, conn->ibc_peer->ibp_nid, frc);
+ kibnal_connreq_done (conn, 1, -ECONNABORTED);
+ /* XXX don't call reject after accept fails? */
+ return;
+ }
+
+ CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+ kibnal_connreq_done (conn, 1, 0);
+ return;
+
+reject:
+ kibnal_reject(cep, reason);
+ kibnal_connreq_done (conn, 1, -EPROTO);
+}
+
+/* ib_cm.h has a wealth of information on the CM procedures */
+static void
+kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+ kib_conn_t *conn = arg;
+
+ CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+ /* Established Connection Notifier */
+ switch (info->Status) {
+ default:
+ CERROR("unknown status %d on Connection %p -> "LPX64"\n",
+ info->Status, conn, conn->ibc_peer->ibp_nid);
+ LBUG();
+ break;
+
+ case FCM_CONNECT_REPLY:
+ kibnal_connect_reply(cep, info, arg);
+ break;
+
+ case FCM_DISCONNECT_REQUEST:
+ /* XXX lock around these state management bits? */
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+ kibnal_close_conn (conn, 0);
+ conn->ibc_state = IBNAL_CONN_DREP;
+ iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+ break;
+
+ /* these both guarantee that no more cm callbacks will occur */
+ case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */
+ case FCM_DISCONNECT_REPLY:
+ CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+ conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+ kibnal_flush_pending(conn);
+ kibnal_put_conn(conn); /* Lose CM's ref */
+ break;
+ }
+
+ return;
+}
+
+static int
+kibnal_set_cm_flags(IB_HANDLE cep)
+{
+ FSTATUS frc;
+ uint32 value = 1;
+
+ frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
+ (char *)&value, sizeof(value), 0);
+ if (frc != FSUCCESS) {
+ CERROR("error setting timeout callback: %d\n", frc);
+ return -1;
+ }
+
+#if 0
+ frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
+ sizeof(value), 0);
+ if (frc != FSUCCESS) {
+ CERROR("error setting async accept: %d\n", frc);
+ return -1;
+ }
+#endif
+
+ return 0;
+}
+
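+/* CM callback for the listening CEP: a peer is trying to connect.
+ * Validate its wire connreq, set up a conn + QP via kibnal_accept() and
+ * reply with our own connection parameters */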
+void
+kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+ IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+ IB_QP_ATTRIBUTES_QUERY *query;
+ CM_REQUEST_INFO *req;
+ CM_CONN_INFO *rep = NULL, *rcv = NULL;
+ kib_wire_connreq_t *wcr;
+ kib_conn_t *conn = NULL;
+ uint16_t reason = 0;
+ FSTATUS frc;
+ int rc = 0;
+
+ LASSERT(cep);
+ LASSERT(info);
+ LASSERT(arg == NULL); /* no conn yet for passive */
+
+ CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+ req = &info->Info.Request;
+ wcr = (kib_wire_connreq_t *)req->PrivateData;
+
+ CDEBUG(D_NET, "%d from "LPX64"\n", info->Status,
+ le64_to_cpu(wcr->wcr_nid));
+
+ if (info->Status == FCM_CONNECT_CANCEL)
+ return;
+
+ LASSERT (info->Status == FCM_CONNECT_REQUEST);
+
+ if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+ CERROR ("Can't accept: bad magic %08x\n",
+ le32_to_cpu(wcr->wcr_magic));
+ GOTO(out, reason = RC_USER_REJ);
+ }
+
+ if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+ CERROR ("Can't accept: bad version %d\n",
+ le16_to_cpu(wcr->wcr_version));
+ GOTO(out, reason = RC_USER_REJ);
+ }
+
+ rc = kibnal_accept(&conn, cep,
+ le64_to_cpu(wcr->wcr_nid),
+ le64_to_cpu(wcr->wcr_incarnation),
+ le16_to_cpu(wcr->wcr_queue_depth));
+ if (rc != 0) {
+ CERROR ("Can't accept "LPX64": %d\n",
+ le64_to_cpu(wcr->wcr_nid), rc);
+ GOTO(out, reason = RC_NO_RESOURCES);
+ }
+
+ frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN,
+ min_t(__u8, req->CEPInfo.OfferedInitiatorDepth,
+ ca_attr->MaxQPResponderResources),
+ &req->PathInfo.Path,
+ min_t(__u8, req->CEPInfo.OfferedResponderResources,
+ ca_attr->MaxQPInitiatorDepth),
+ req->CEPInfo.StartingPSN);
+
+ if (frc != FSUCCESS) {
+ CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n",
+ le64_to_cpu(wcr->wcr_nid), frc);
+ GOTO(out, reason = RC_NO_QP);
+ }
+
+ frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Couldn't query qp attributes "LPX64": %d\n",
+ le64_to_cpu(wcr->wcr_nid), frc);
+ GOTO(out, reason = RC_NO_QP);
+ }
+ query = &conn->ibc_qp_attrs;
+
+ PORTAL_ALLOC(rep, sizeof(*rep));
+ PORTAL_ALLOC(rcv, sizeof(*rcv));
+ if (rep == NULL || rcv == NULL) {
+ CERROR ("can't reply and receive buffers\n");
+ GOTO(out, reason = RC_INSUFFICIENT_RESP_RES);
+ }
+
+ /* NB 'wcr' now points at the reply's private data, not the incoming request */
+ wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData;
+
+ rep->Info.Reply = (CM_REPLY_INFO) {
+ .QPN = query->QPNumber,
+ .QKey = query->Qkey,
+ .StartingPSN = query->RecvPSN,
+ .EndToEndFlowControl = query->FlowControl,
+ /* XXX Hmm. */
+ .ArbInitiatorDepth = query->InitiatorDepth,
+ .ArbResponderResources = query->ResponderResources,
+ .TargetAckDelay = 0,
+ .FailoverAccepted = 0,
+ .RnRRetryCount = req->CEPInfo.RnrRetryCount,
+ };
+
+ *wcr = (kib_wire_connreq_t) {
+ .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
+ .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
+ .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+ .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
+ .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+ };
+
+ frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn,
+ &conn->ibc_cep);
+
+ PORTAL_FREE(rep, sizeof(*rep));
+ PORTAL_FREE(rcv, sizeof(*rcv));
+
+ if (frc != FCM_CONNECT_ESTABLISHED) {
+ /* XXX it seems we don't call reject after this point? */
+ CERROR("iibt_cm_accept() failed: %d, aborting\n", frc);
+ rc = -ECONNABORTED;
+ goto out;
+ }
+
+ if (kibnal_set_cm_flags(conn->ibc_cep)) {
+ rc = -ECONNABORTED;
+ goto out;
+ }
+
+ CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+out:
+ if (reason) {
+ kibnal_reject(cep, reason);
+ rc = -ECONNABORTED;
+ }
+ if (conn != NULL)
+ kibnal_connreq_done(conn, 0, rc);
+
+ return;
+}
+
+static void
+dump_path_records(PATH_RESULTS *results)
+{
+ IB_PATH_RECORD *path;
+ int i;
+
+ for(i = 0; i < results->NumPathRecords; i++) {
+ path = &results->PathRecords[i];
+ CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid "
+ LPX64":"LPX64" pkey %x\n",
+ i,
+ path->SGID.Type.Global.SubnetPrefix,
+ path->SGID.Type.Global.InterfaceID,
+ path->DGID.Type.Global.SubnetPrefix,
+ path->DGID.Type.Global.InterfaceID,
+ path->P_Key);
+ }
+}
+
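+/* path record query completed: build the CM request (wire connreq in its
+ * private data) and start the actual connect */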
+static void
+kibnal_pathreq_callback (void *arg, QUERY *query,
+ QUERY_RESULT_VALUES *query_res)
+{
+ IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+ kib_conn_t *conn = arg;
+ PATH_RESULTS *path;
+ FSTATUS frc;
+
+ if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+ CERROR ("status %d data size %d\n", query_res->Status,
+ query_res->ResultDataSize);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ path = (PATH_RESULTS *)query_res->QueryResult;
+
+ if (path->NumPathRecords < 1) {
+ CERROR ("expected path records: %d\n", path->NumPathRecords);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ dump_path_records(path);
+
+ /* just using the first. this is probably a horrible idea. */
+ conn->ibc_connreq->cr_path = path->PathRecords[0];
+
+ conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE);
+ if (conn->ibc_cep == NULL) {
+ CERROR ("Can't create CEP\n");
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ if (kibnal_set_cm_flags(conn->ibc_cep)) {
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+ .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
+ .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
+ .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+ .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
+ .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+ };
+
+ conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) {
+ .SID = conn->ibc_connreq->cr_service.RID.ServiceID,
+ .CEPInfo = (CM_CEP_INFO) {
+ .CaGUID = kibnal_data.kib_hca_guids[0],
+ .EndToEndFlowControl = FALSE,
+ .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID,
+ .RetryCount = IBNAL_RETRY,
+ .RnrRetryCount = IBNAL_RNR_RETRY,
+ .AckTimeout = IBNAL_ACK_TIMEOUT,
+ .StartingPSN = IBNAL_STARTING_PSN,
+ .QPN = conn->ibc_qp_attrs.QPNumber,
+ .QKey = conn->ibc_qp_attrs.Qkey,
+ .OfferedResponderResources = ca_attr->MaxQPResponderResources,
+ .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth,
+ },
+ .PathInfo = (CM_CEP_PATHINFO) {
+ .bSubnetLocal = TRUE,
+ .Path = conn->ibc_connreq->cr_path,
+ },
+ };
+
+#if 0
+ /* XXX set timeout just like SDP!!!*/
+ conn->ibc_connreq->cr_path.packet_life = 13;
+#endif
+ /* Flag I'm getting involved with the CM... */
+ conn->ibc_state = IBNAL_CONN_CONNECTING;
+
+ CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
+ conn->ibc_connreq->cr_service.RID.ServiceID,
+ *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+ memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0,
+ CM_REQUEST_INFO_USER_LEN);
+ memcpy(conn->ibc_connreq->cr_cmreq.PrivateData,
+ &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
+
+ /* kibnal_cm_callback gets my conn ref */
+ frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq,
+ kibnal_cm_callback, conn);
+ if (frc != FPENDING && frc != FSUCCESS) {
+ CERROR ("Connect: %d\n", frc);
+ /* Back out state change as connect failed */
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ }
+}
+
+static void
+dump_service_records(SERVICE_RECORD_RESULTS *results)
+{
+ IB_SERVICE_RECORD *svc;
+ int i;
+
+ for(i = 0; i < results->NumServiceRecords; i++) {
+ svc = &results->ServiceRecords[i];
+ CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
+ i,
+ svc->RID.ServiceID,
+ svc->RID.ServiceGID.Type.Global.SubnetPrefix,
+ svc->RID.ServiceGID.Type.Global.InterfaceID,
+ svc->RID.ServiceP_Key);
+ }
+}
+
+
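+/* service record query completed: we now know the peer's GID, so ask the
+ * SA for a path record to it */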
+static void
+kibnal_service_get_callback (void *arg, QUERY *query,
+ QUERY_RESULT_VALUES *query_res)
+{
+ kib_conn_t *conn = arg;
+ SERVICE_RECORD_RESULTS *svc;
+ COMMAND_CONTROL_PARAMETERS sd_params;
+ QUERY path_query;
+ FSTATUS frc;
+
+ if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+ CERROR ("status %d data size %d\n", query_res->Status,
+ query_res->ResultDataSize);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult;
+
+ if (svc->NumServiceRecords < 1) {
+ CERROR ("%d service records\n", svc->NumServiceRecords);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ dump_service_records(svc);
+
+ conn->ibc_connreq->cr_service = svc->ServiceRecords[0];
+
+ CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
+ query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID,
+ *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+ memset(&path_query, 0, sizeof(path_query));
+ path_query.InputType = InputTypePortGuidPair;
+ path_query.OutputType = OutputTypePathRecord;
+ path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid;
+ path_query.InputValue.PortGuidPair.DestPortGuid = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID;
+
+ memset(&sd_params, 0, sizeof(sd_params));
+ sd_params.RetryCount = IBNAL_RETRY;
+ sd_params.Timeout = 10 * 1000; /* wait 10 seconds */
+
+ /* kibnal_service_get_callback gets my conn ref */
+
+ frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ &path_query,
+ kibnal_pathreq_callback,
+ &sd_params, conn);
+ if (frc == FPENDING)
+ return;
+
+ CERROR ("Path record request failed: %d\n", frc);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+}
+
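+/* start an active connection: allocate the conn and connreq, then look up
+ * the peer's service record with the subnet administrator; the handshake
+ * continues in the query callbacks above */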
+static void
+kibnal_connect_peer (kib_peer_t *peer)
+{
+ COMMAND_CONTROL_PARAMETERS sd_params;
+ QUERY query;
+ FSTATUS frc;
+ kib_conn_t *conn = kibnal_create_conn();
+
+ LASSERT (peer->ibp_connecting != 0);
+
+ if (conn == NULL) {
+ CERROR ("Can't allocate conn\n");
+ kibnal_peer_connect_failed (peer, 1, -ENOMEM);
+ return;
+ }
+
+ conn->ibc_peer = peer;
+ kib_peer_addref(peer);
+
+ PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+ if (conn->ibc_connreq == NULL) {
+ CERROR ("Can't allocate connreq\n");
+ kibnal_connreq_done (conn, 1, -ENOMEM);
+ return;
+ }
+
+ memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+
+ kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+
+ memset(&query, 0, sizeof(query));
+ query.InputType = InputTypeServiceRecord;
+ query.OutputType = OutputTypeServiceRecord;
+ query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service;
+ query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+
+ memset(&sd_params, 0, sizeof(sd_params));
+ sd_params.RetryCount = IBNAL_RETRY;
+ sd_params.Timeout = 10 * 1000; /* wait 10 seconds */
+
+ /* kibnal_service_get_callback gets my conn ref */
+ frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ &query,
+ kibnal_service_get_callback,
+ &sd_params, conn);
+ if (frc == FPENDING)
+ return;
+
+ CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc);
+ /* don't pass an FSTATUS through as an errno */
+ kibnal_connreq_done (conn, 1, -EINVAL);
+}
+
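+/* returns non-zero if any queued or active transmit on this connection
+ * has passed its deadline */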
+static int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+ kib_tx_t *tx;
+ struct list_head *ttmp;
+ unsigned long flags;
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ list_for_each (ttmp, &conn->ibc_tx_queue) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+ LASSERT (!tx->tx_passive_rdma_wait);
+ LASSERT (tx->tx_sending == 0);
+
+ if (time_after_eq (jiffies, tx->tx_deadline)) {
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ return 1;
+ }
+ }
+
+ list_for_each (ttmp, &conn->ibc_active_txs) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ if (time_after_eq (jiffies, tx->tx_deadline)) {
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ return 1;
+ }
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ return 0;
+}
+
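+/* scan one peer hash bucket for established connections with timed-out
+ * transmits and close them */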
+static void
+kibnal_check_conns (int idx)
+{
+ struct list_head *peers = &kibnal_data.kib_peers[idx];
+ struct list_head *ptmp;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+
+ again:
+ /* NB. We expect to have a look at all the peers and not find any
+ * rdmas to time out, so we just use a shared lock while we
+ * take a look... */
+ read_lock (&kibnal_data.kib_global_lock);
+
+ list_for_each (ptmp, peers) {
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+ list_for_each (ctmp, &peer->ibp_conns) {
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+ KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+
+ /* In case we have enough credits to return via a
+ * NOOP, but there were no non-blocking tx descs
+ * free to do it last time... */
+ kibnal_check_sends(conn);
+
+ if (!kibnal_conn_timed_out(conn))
+ continue;
+
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+
+ atomic_inc (&conn->ibc_refcount);
+ read_unlock (&kibnal_data.kib_global_lock);
+
+ CERROR("Timed out RDMA with "LPX64"\n",
+ peer->ibp_nid);
+
+ kibnal_close_conn (conn, -ETIMEDOUT);
+ kibnal_put_conn (conn);
+
+ /* start again now I've dropped the lock */
+ goto again;
+ }
+ }
+
+ read_unlock (&kibnal_data.kib_global_lock);
+}
+
+static void
+kib_connd_handle_state(kib_conn_t *conn)
+{
+ FSTATUS frc;
+
+ switch (conn->ibc_state) {
+ /* all refs have gone, free and be done with it */
+ case IBNAL_CONN_DISCONNECTED:
+ kibnal_destroy_conn (conn);
+ return; /* avoid put_conn */
+
+ case IBNAL_CONN_SEND_DREQ:
+ frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+ if (frc != FSUCCESS) /* XXX do real things */
+ CERROR("disconnect failed: %d\n", frc);
+ conn->ibc_state = IBNAL_CONN_DREQ;
+ break;
+
+ /* a callback got to the conn before we did */
+ case IBNAL_CONN_DREP:
+ break;
+
+ default:
+ CERROR ("Bad conn %p state: %d\n", conn,
+ conn->ibc_state);
+ LBUG();
+ break;
+ }
+
+ /* drop ref from close_conn */
+ kibnal_put_conn(conn);
+}
+
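+/* connection daemon: progresses disconnects handed off by
+ * kibnal_close_conn_locked(), starts queued active connects and
+ * periodically sweeps the peer table for RDMA timeouts */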
+int
+kibnal_connd (void *arg)
+{
+ wait_queue_t wait;
+ unsigned long flags;
+ kib_conn_t *conn;
+ kib_peer_t *peer;
+ int timeout;
+ int i;
+ int peer_index = 0;
+ unsigned long deadline = jiffies;
+
+ kportal_daemonize ("kibnal_connd");
+ kportal_blockallsigs ();
+
+ init_waitqueue_entry (&wait, current);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+ for (;;) {
+ if (!list_empty (&kibnal_data.kib_connd_conns)) {
+ conn = list_entry (kibnal_data.kib_connd_conns.next,
+ kib_conn_t, ibc_list);
+ list_del (&conn->ibc_list);
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+ kib_connd_handle_state(conn);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ continue;
+ }
+
+ if (!list_empty (&kibnal_data.kib_connd_peers)) {
+ peer = list_entry (kibnal_data.kib_connd_peers.next,
+ kib_peer_t, ibp_connd_list);
+
+ list_del_init (&peer->ibp_connd_list);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+ kibnal_connect_peer (peer);
+ kib_peer_decref (peer);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ }
+
+ /* shut down and nobody left to reap... */
+ if (kibnal_data.kib_shutdown &&
+ atomic_read(&kibnal_data.kib_nconns) == 0)
+ break;
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+ /* careful with the jiffy wrap... */
+ while ((timeout = (int)(deadline - jiffies)) <= 0) {
+ const int n = 4;
+ const int p = 1;
+ int chunk = kibnal_data.kib_peer_hash_size;
+
+ /* Time to check for RDMA timeouts on a few more
+ * peers: I do checks every 'p' seconds on a
+ * proportion of the peer table and I need to check
+ * every connection 'n' times within a timeout
+ * interval, to ensure I detect a timeout on any
+ * connection within (n+1)/n times the timeout
+ * interval. */
+
+ if (kibnal_tunables.kib_io_timeout > n * p)
+ chunk = (chunk * n * p) /
+ kibnal_tunables.kib_io_timeout;
+ if (chunk == 0)
+ chunk = 1;
+
+ for (i = 0; i < chunk; i++) {
+ kibnal_check_conns (peer_index);
+ peer_index = (peer_index + 1) %
+ kibnal_data.kib_peer_hash_size;
+ }
+
+ deadline += p * HZ;
+ }
+
+ kibnal_data.kib_connd_waketime = jiffies + timeout;
+
+ set_current_state (TASK_INTERRUPTIBLE);
+ add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+ if (!kibnal_data.kib_shutdown &&
+ list_empty (&kibnal_data.kib_connd_conns) &&
+ list_empty (&kibnal_data.kib_connd_peers))
+ schedule_timeout (timeout);
+
+ set_current_state (TASK_RUNNING);
+ remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ }
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+ kibnal_thread_fini ();
+ return (0);
+}
+
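+/* scheduler thread: drains the scheduler tx queue (completing those
+ * transmits) and the rx queue (handling received messages) in process
+ * context */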
+int
+kibnal_scheduler(void *arg)
+{
+ long id = (long)arg;
+ char name[16];
+ kib_rx_t *rx;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int rc;
+ int counter = 0;
+ int did_something;
+
+ snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
+ kportal_daemonize(name);
+ kportal_blockallsigs();
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+
+ for (;;) {
+ did_something = 0;
+
+ while (!list_empty(&kibnal_data.kib_sched_txq)) {
+ tx = list_entry(kibnal_data.kib_sched_txq.next,
+ kib_tx_t, tx_list);
+ list_del(&tx->tx_list);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+ kibnal_tx_done(tx);
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+ flags);
+ }
+
+ if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+ rx = list_entry(kibnal_data.kib_sched_rxq.next,
+ kib_rx_t, rx_list);
+ list_del(&rx->rx_list);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+
+ kibnal_rx(rx);
+
+ did_something = 1;
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+ flags);
+ }
+
+ /* shut down and no receives to complete... */
+ if (kibnal_data.kib_shutdown &&
+ atomic_read(&kibnal_data.kib_nconns) == 0)
+ break;
+
+ /* nothing to do or hogging CPU */
+ if (!did_something || counter++ == IBNAL_RESCHED) {
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+ counter = 0;
+
+ if (!did_something) {
+ rc = wait_event_interruptible(
+ kibnal_data.kib_sched_waitq,
+ !list_empty(&kibnal_data.kib_sched_txq) ||
+ !list_empty(&kibnal_data.kib_sched_rxq) ||
+ (kibnal_data.kib_shutdown &&
+ atomic_read (&kibnal_data.kib_nconns) == 0));
+ } else {
+ our_cond_resched();
+ }
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+ flags);
+ }
+ }
+
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+
+ kibnal_thread_fini();
+ return (0);
+}
+
+
+lib_nal_t kibnal_lib = {
+ libnal_data: &kibnal_data, /* NAL private data */
+ libnal_send: kibnal_send,
+ libnal_send_pages: kibnal_send_pages,
+ libnal_recv: kibnal_recv,
+ libnal_recv_pages: kibnal_recv_pages,
+ libnal_dist: kibnal_dist
+};