/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * Copyright (C) 2004 Cluster File Systems, Inc.
- * Author: Eric Barton <eric@bartonsoftware.com>
- * Author: Frank Zago <fzago@systemfabricworks.com>
+ * GPL HEADER START
*
- * This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
- * Lustre is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
*
- * Lustre is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
*
- * You should have received a copy of the GNU General Public License
- * along with Lustre; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
*
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/viblnd/viblnd.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ * Author: Frank Zago <fzago@systemfabricworks.com>
*/
-#include "vibnal.h"
-
-nal_t kibnal_api;
-ptl_handle_ni_t kibnal_ni;
-kib_data_t kibnal_data;
-kib_tunables_t kibnal_tunables;
-
-#ifdef CONFIG_SYSCTL
-#define IBNAL_SYSCTL 202
-
-#define IBNAL_SYSCTL_TIMEOUT 1
+#include "viblnd.h"
-static ctl_table kibnal_ctl_table[] = {
- {IBNAL_SYSCTL_TIMEOUT, "timeout",
- &kibnal_tunables.kib_io_timeout, sizeof (int),
- 0644, NULL, &proc_dointvec},
- { 0 }
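+/* LND operations table for the Voltaire IB LND; registered with LNET
+ * via lnet_register_lnd() in the module init code below */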
+lnd_t the_kiblnd = {
+ .lnd_type = VIBLND,
+ .lnd_startup = kibnal_startup,
+ .lnd_shutdown = kibnal_shutdown,
+ .lnd_ctl = kibnal_ctl,
+ .lnd_send = kibnal_send,
+ .lnd_recv = kibnal_recv,
+ .lnd_eager_recv = kibnal_eager_recv,
};
-static ctl_table kibnal_top_ctl_table[] = {
- {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
- { 0 }
-};
-#endif
+kib_data_t kibnal_data;
void vibnal_assert_wire_constants (void)
{
/* Wire protocol assertions generated by 'wirecheck'
- * running on Linux robert.bartonsoftware.com 2.6.5-1.358 #1 Sat May 8 09:04:50 EDT 2004 i686
- * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+ * running on Linux robert 2.6.11-1.27_FC3 #1 Tue May 17 20:27:37 EDT 2005 i686 athlon i386 G
+ * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */
/* Constants... */
CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91);
- CLASSERT (IBNAL_MSG_VERSION == 6);
+ CLASSERT (IBNAL_MSG_VERSION == 0x11);
CLASSERT (IBNAL_MSG_CONNREQ == 0xc0);
CLASSERT (IBNAL_MSG_CONNACK == 0xc1);
CLASSERT (IBNAL_MSG_NOOP == 0xd0);
CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72);
CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85);
CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1);
-
- /* Checks for struct kib_rdma_frag_t */
- CLASSERT ((int)sizeof(kib_rdma_frag_t) == 12);
- CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_nob) == 0);
- CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_nob) == 4);
- CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_lo) == 4);
- CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_lo) == 4);
- CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_hi) == 8);
- CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_hi) == 4);
+ CLASSERT (IBNAL_USE_FMR == 1);
/* Checks for struct kib_rdma_desc_t */
- CLASSERT ((int)sizeof(kib_rdma_desc_t) == 8);
- CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 0);
+ CLASSERT ((int)sizeof(kib_rdma_desc_t) == 16);
+ CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_addr) == 0);
+ CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_addr) == 8);
+ CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nob) == 8);
+ CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nob) == 4);
+ CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 12);
CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4);
- CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nfrag) == 4);
- CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nfrag) == 4);
- CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_frags[13]) == 164);
- CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_frags[13]) == 12);
/* Checks for struct kib_putreq_msg_t */
CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80);
CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8);
/* Checks for struct kib_putack_msg_t */
- CLASSERT ((int)sizeof(kib_putack_msg_t) == 24);
+ CLASSERT ((int)sizeof(kib_putack_msg_t) == 32);
CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0);
CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8);
CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8);
CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8);
CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16);
- CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 8);
+ CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 16);
/* Checks for struct kib_get_msg_t */
- CLASSERT ((int)sizeof(kib_get_msg_t) == 88);
+ CLASSERT ((int)sizeof(kib_get_msg_t) == 96);
CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0);
CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72);
CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72);
CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8);
CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80);
- CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 8);
+ CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 16);
/* Checks for struct kib_completion_msg_t */
CLASSERT ((int)sizeof(kib_completion_msg_t) == 12);
CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4);
/* Checks for struct kib_msg_t */
- CLASSERT ((int)sizeof(kib_msg_t) == 144);
+ CLASSERT ((int)sizeof(kib_msg_t) == 152);
CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0);
CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4);
CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4);
CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56);
CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80);
CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56);
- CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 24);
+ CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 32);
CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56);
- CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 88);
+ CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 96);
CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56);
CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12);
}
-void
-kibnal_pause(int ticks)
-{
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(ticks);
-}
-
__u32
kibnal_cksum (void *ptr, int nob)
{
}
void
-kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid,
- __u64 dststamp, __u64 seq)
+kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits,
+ lnet_nid_t dstnid, __u64 dststamp, __u64 seq)
{
/* CAVEAT EMPTOR! all message fields not set here should have been
* initialised previously. */
msg->ibm_magic = IBNAL_MSG_MAGIC;
- msg->ibm_version = IBNAL_MSG_VERSION;
+ msg->ibm_version = version;
/* ibm_type */
msg->ibm_credits = credits;
/* ibm_nob */
msg->ibm_cksum = 0;
- msg->ibm_srcnid = kibnal_lib.libnal_ni.ni_pid.nid;
+ msg->ibm_srcnid = kibnal_data.kib_ni->ni_nid;
msg->ibm_srcstamp = kibnal_data.kib_incarnation;
msg->ibm_dstnid = dstnid;
msg->ibm_dststamp = dststamp;
msg->ibm_seq = seq;
-#if IBNAL_CKSUM
- /* NB ibm_cksum zero while computing cksum */
- msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
-#endif
+
+ if (*kibnal_tunables.kib_cksum) {
+ /* NB ibm_cksum zero while computing cksum */
+ msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
+ }
}
int
-kibnal_unpack_msg(kib_msg_t *msg, int nob)
+kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
{
const int hdr_size = offsetof(kib_msg_t, ibm_u);
__u32 msg_cksum;
+ __u32 msg_version;
int flip;
int msg_nob;
+#if !IBNAL_USE_FMR
int i;
int n;
-
+#endif
/* 6 bytes are enough to have received magic + version */
if (nob < 6) {
CERROR("Short message: %d\n", nob);
return -EPROTO;
}
+ /* Future protocol version compatibility support!
+ * If the viblnd-specific protocol changes, or when LNET unifies
+ * protocols over all LNDs, the initial connection will negotiate a
+ * protocol version. If I find a peer speaking an unknown version during
+ * connection establishment, I fail quietly with no console error; the
+ * connection reject will tell the peer which version I'm running. */
+
if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
flip = 0;
} else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
flip = 1;
} else {
+ if (msg->ibm_magic == LNET_PROTO_MAGIC ||
+ msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+ return -EPROTO;
+
+ /* Completely out to lunch */
CERROR("Bad magic: %08x\n", msg->ibm_magic);
return -EPROTO;
}
- if (msg->ibm_version !=
- (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
- CERROR("Bad version: %d\n", msg->ibm_version);
+ msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+ if (expected_version == 0) {
+ if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
+ msg_version != IBNAL_MSG_VERSION)
+ return -EPROTO;
+ } else if (msg_version != expected_version) {
+ CERROR("Bad version: %x(%x expected)\n",
+ msg_version, expected_version);
return -EPROTO;
}
return -EPROTO;
}
msg->ibm_cksum = msg_cksum;
-
+
if (flip) {
/* leave magic unflipped as a clue to peer endianness */
- __swab16s(&msg->ibm_version);
+ msg->ibm_version = msg_version;
CLASSERT (sizeof(msg->ibm_type) == 1);
CLASSERT (sizeof(msg->ibm_credits) == 1);
msg->ibm_nob = msg_nob;
__swab64s(&msg->ibm_dststamp);
__swab64s(&msg->ibm_seq);
}
-
- if (msg->ibm_srcnid == PTL_NID_ANY) {
- CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid);
+
+ if (msg->ibm_srcnid == LNET_NID_ANY) {
+ CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
return -EPROTO;
}
default:
CERROR("Unknown message type %x\n", msg->ibm_type);
return -EPROTO;
-
+
case IBNAL_MSG_NOOP:
break;
break;
case IBNAL_MSG_PUT_REQ:
- if (msg_nob < sizeof(msg->ibm_u.putreq)) {
+ if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
(int)(hdr_size + sizeof(msg->ibm_u.putreq)));
return -EPROTO;
break;
case IBNAL_MSG_PUT_ACK:
- if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])) {
+ if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
- (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0]));
+ (int)(hdr_size + sizeof(msg->ibm_u.putack)));
return -EPROTO;
}
-
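+ /* With FMR the RDMA descriptor is a single {addr, nob, key} triple;
+ * the non-FMR path below carries a fragment list instead */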
+#if IBNAL_USE_FMR
+ if (flip) {
+ __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
+ __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
+ __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+ }
+#else
if (flip) {
__swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
__swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
}
-
+
n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
- CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n",
+ CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n",
n, IBNAL_MAX_RDMA_FRAGS);
return -EPROTO;
}
-
+
if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
(int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
return -EPROTO;
}
- if (flip)
+ if (flip) {
for (i = 0; i < n; i++) {
__swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
__swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo);
__swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi);
}
+ }
+#endif
break;
case IBNAL_MSG_GET_REQ:
(int)(hdr_size + sizeof(msg->ibm_u.get)));
return -EPROTO;
}
+#if IBNAL_USE_FMR
+ if (flip) {
+ __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
+ __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
+ __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+ }
+#else
if (flip) {
__swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
__swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
- CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n",
+ CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n",
n, IBNAL_MAX_RDMA_FRAGS);
return -EPROTO;
}
-
+
if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
(int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
return -EPROTO;
}
-
+
if (flip)
for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
__swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
__swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo);
__swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi);
}
+#endif
break;
case IBNAL_MSG_PUT_NAK:
}
int
-kibnal_set_mynid(ptl_nid_t nid)
+kibnal_start_listener (lnet_ni_t *ni)
{
- static cm_listen_data_t info; /* protected by kib_nid_mutex */
+ static cm_listen_data_t info;
- lib_ni_t *ni = &kibnal_lib.libnal_ni;
- int rc;
cm_return_t cmrc;
- CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
- nid, ni->ni_pid.nid);
+ LASSERT (kibnal_data.kib_listen_handle == NULL);
- down (&kibnal_data.kib_nid_mutex);
-
- if (nid == ni->ni_pid.nid) {
- /* no change of NID */
- up (&kibnal_data.kib_nid_mutex);
- return (0);
+ kibnal_data.kib_listen_handle =
+ cm_create_cep(cm_cep_transp_rc);
+ if (kibnal_data.kib_listen_handle == NULL) {
+ CERROR ("Can't create listen CEP\n");
+ return -ENOMEM;
}
- CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_pid.nid, nid);
-
- if (kibnal_data.kib_listen_handle != NULL) {
- cmrc = cm_cancel(kibnal_data.kib_listen_handle);
- if (cmrc != cm_stat_success)
- CERROR ("Error %d stopping listener\n", cmrc);
+ CDEBUG(D_NET, "Created CEP %p for listening\n",
+ kibnal_data.kib_listen_handle);
- kibnal_pause(HZ/10); /* ensure no more callbacks */
-
- cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
- if (cmrc != vv_return_ok)
- CERROR ("Error %d destroying CEP\n", cmrc);
+ memset(&info, 0, sizeof(info));
+ info.listen_addr.end_pt.sid =
+ (__u64)(*kibnal_tunables.kib_service_number);
- kibnal_data.kib_listen_handle = NULL;
- }
+ cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
+ kibnal_listen_callback, NULL);
+ if (cmrc == cm_stat_success)
+ return 0;
- /* Change NID. NB queued passive connection requests (if any) will be
- * rejected with an incorrect destination NID */
- ni->ni_pid.nid = nid;
- kibnal_data.kib_incarnation++;
- mb();
+ CERROR ("cm_listen error: %d\n", cmrc);
- /* Delete all existing peers and their connections after new
- * NID/incarnation set to ensure no old connections in our brave
- * new world. */
- kibnal_del_peer (PTL_NID_ANY, 0);
+ cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
+ LASSERT (cmrc == cm_stat_success);
- if (ni->ni_pid.nid != PTL_NID_ANY) { /* got a new NID to install */
- kibnal_data.kib_listen_handle =
- cm_create_cep(cm_cep_transp_rc);
- if (kibnal_data.kib_listen_handle == NULL) {
- CERROR ("Can't create listen CEP\n");
- rc = -ENOMEM;
- goto failed_0;
- }
+ kibnal_data.kib_listen_handle = NULL;
+ return -EINVAL;
+}
- CDEBUG(D_NET, "Created CEP %p for listening\n",
- kibnal_data.kib_listen_handle);
+void
+kibnal_stop_listener(lnet_ni_t *ni)
+{
+ cm_return_t cmrc;
- memset(&info, 0, sizeof(info));
- info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id;
+ LASSERT (kibnal_data.kib_listen_handle != NULL);
- cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
- kibnal_listen_callback, NULL);
- if (cmrc != 0) {
- CERROR ("cm_listen error: %d\n", cmrc);
- rc = -EINVAL;
- goto failed_1;
- }
- }
+ cmrc = cm_cancel(kibnal_data.kib_listen_handle);
+ if (cmrc != cm_stat_success)
+ CERROR ("Error %d stopping listener\n", cmrc);
- up (&kibnal_data.kib_nid_mutex);
- return (0);
+ cfs_pause(cfs_time_seconds(1)/10); /* ensure no more callbacks */
- failed_1:
cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
- LASSERT (cmrc == cm_stat_success);
+ if (cmrc != vv_return_ok)
+ CERROR ("Error %d destroying CEP\n", cmrc);
+
kibnal_data.kib_listen_handle = NULL;
- failed_0:
- ni->ni_pid.nid = PTL_NID_ANY;
- kibnal_data.kib_incarnation++;
- mb();
- kibnal_del_peer (PTL_NID_ANY, 0);
- up (&kibnal_data.kib_nid_mutex);
- return rc;
}
-kib_peer_t *
-kibnal_create_peer (ptl_nid_t nid)
+int
+kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
{
- kib_peer_t *peer;
+ kib_peer_t *peer;
+ unsigned long flags;
+ int rc;
- LASSERT (nid != PTL_NID_ANY);
+ LASSERT (nid != LNET_NID_ANY);
- PORTAL_ALLOC(peer, sizeof (*peer));
+ LIBCFS_ALLOC(peer, sizeof (*peer));
if (peer == NULL) {
- CERROR("Canot allocate perr\n");
- return (NULL);
+ CERROR("Cannot allocate peer\n");
+ return -ENOMEM;
}
memset(peer, 0, sizeof(*peer)); /* zero flags etc */
INIT_LIST_HEAD (&peer->ibp_conns);
INIT_LIST_HEAD (&peer->ibp_tx_queue);
- peer->ibp_reconnect_time = jiffies;
- peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+ peer->ibp_error = 0;
+ peer->ibp_last_alive = cfs_time_current();
+ peer->ibp_reconnect_interval = 0; /* OK to connect at any time */
- atomic_inc (&kibnal_data.kib_npeers);
- if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS)
- return peer;
-
- CERROR("Too many peers: CQ will overflow\n");
- kibnal_peer_decref(peer);
- return NULL;
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+ if (atomic_read(&kibnal_data.kib_npeers) >=
+ *kibnal_tunables.kib_concurrent_peers) {
+ rc = -EOVERFLOW; /* !! but at least it distinguishes */
+ } else if (kibnal_data.kib_listen_handle == NULL) {
+ rc = -ESHUTDOWN; /* shutdown has started */
+ } else {
+ rc = 0;
+ /* npeers only grows with the global lock held */
+ atomic_inc(&kibnal_data.kib_npeers);
+ }
+
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ if (rc != 0) {
+ CERROR("Can't create peer: %s\n",
+ (rc == -ESHUTDOWN) ? "shutting down" :
+ "too many peers");
+ LIBCFS_FREE(peer, sizeof(*peer));
+ } else {
+ *peerp = peer;
+ }
+
+ return rc;
}
void
kibnal_destroy_peer (kib_peer_t *peer)
{
-
LASSERT (atomic_read (&peer->ibp_refcount) == 0);
LASSERT (peer->ibp_persistence == 0);
LASSERT (!kibnal_peer_active(peer));
LASSERT (peer->ibp_connecting == 0);
+ LASSERT (peer->ibp_accepting == 0);
LASSERT (list_empty (&peer->ibp_conns));
LASSERT (list_empty (&peer->ibp_tx_queue));
-
- PORTAL_FREE (peer, sizeof (*peer));
+
+ LIBCFS_FREE (peer, sizeof (*peer));
/* NB a peer's connections keep a reference on their peer until
* they are destroyed, so we can be assured that _all_ state to do
* with this peer has been cleaned up when its refcount drops to
* zero. */
- atomic_dec (&kibnal_data.kib_npeers);
+ atomic_dec(&kibnal_data.kib_npeers);
}
-/* the caller is responsible for accounting for the additional reference
- * that this creates */
kib_peer_t *
-kibnal_find_peer_locked (ptl_nid_t nid)
+kibnal_find_peer_locked (lnet_nid_t nid)
{
+ /* the caller is responsible for accounting the additional reference
+ * that this creates */
struct list_head *peer_list = kibnal_nid2peerlist (nid);
struct list_head *tmp;
kib_peer_t *peer;
LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
peer->ibp_connecting != 0 || /* creating conns */
+ peer->ibp_accepting != 0 ||
!list_empty (&peer->ibp_conns)); /* active conn */
if (peer->ibp_nid != nid)
continue;
- CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
- peer, nid, atomic_read (&peer->ibp_refcount));
+ CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+ peer, libcfs_nid2str(nid),
+ atomic_read (&peer->ibp_refcount));
return (peer);
}
return (NULL);
}
int
-kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp,
+kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp,
int *persistencep)
{
kib_peer_t *peer;
peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 ||
peer->ibp_connecting != 0 ||
+ peer->ibp_accepting != 0 ||
!list_empty (&peer->ibp_conns));
if (index-- > 0)
}
int
-kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip)
+kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip)
{
kib_peer_t *peer;
kib_peer_t *peer2;
unsigned long flags;
+ int rc;
- CDEBUG(D_NET, LPX64"@%08x\n", nid, ip);
-
- if (nid == PTL_NID_ANY)
+ CDEBUG(D_NET, "%s at %u.%u.%u.%u\n",
+ libcfs_nid2str(nid), HIPQUAD(ip));
+
+ if (nid == LNET_NID_ANY)
return (-EINVAL);
- peer = kibnal_create_peer (nid);
- if (peer == NULL)
- return (-ENOMEM);
+ rc = kibnal_create_peer(&peer, nid);
+ if (rc != 0)
+ return rc;
write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ /* I'm always called with a reference on kibnal_data.kib_ni
+ * so shutdown can't have started */
+ LASSERT (kibnal_data.kib_listen_handle != NULL);
+
peer2 = kibnal_find_peer_locked (nid);
if (peer2 != NULL) {
kibnal_peer_decref (peer);
peer->ibp_ip = ip;
peer->ibp_persistence++;
-
+
write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
return (0);
}
void
-kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+kibnal_del_peer_locked (kib_peer_t *peer)
{
struct list_head *ctmp;
struct list_head *cnxt;
kib_conn_t *conn;
- if (!single_share)
- peer->ibp_persistence = 0;
- else if (peer->ibp_persistence > 0)
- peer->ibp_persistence--;
-
- if (peer->ibp_persistence != 0)
- return;
+ peer->ibp_persistence = 0;
if (list_empty(&peer->ibp_conns)) {
kibnal_unlink_peer_locked(peer);
}
int
-kibnal_del_peer (ptl_nid_t nid, int single_share)
+kibnal_del_peer (lnet_nid_t nid)
{
+ CFS_LIST_HEAD (zombies);
struct list_head *ptmp;
struct list_head *pnxt;
kib_peer_t *peer;
write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
- if (nid != PTL_NID_ANY)
+ if (nid != LNET_NID_ANY)
lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
else {
lo = 0;
peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 ||
peer->ibp_connecting != 0 ||
+ peer->ibp_accepting != 0 ||
!list_empty (&peer->ibp_conns));
- if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+ if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
continue;
- kibnal_del_peer_locked (peer, single_share);
- rc = 0; /* matched something */
+ if (!list_empty(&peer->ibp_tx_queue)) {
+ LASSERT (list_empty(&peer->ibp_conns));
- if (single_share)
- goto out;
+ list_splice_init(&peer->ibp_tx_queue, &zombies);
+ }
+
+ kibnal_del_peer_locked (peer);
+ rc = 0; /* matched something */
}
}
- out:
+
write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ kibnal_txlist_done(&zombies, -EIO);
+
return (rc);
}
peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence > 0 ||
peer->ibp_connecting != 0 ||
+ peer->ibp_accepting != 0 ||
!list_empty (&peer->ibp_conns));
list_for_each (ctmp, &peer->ibp_conns) {
return (NULL);
}
+void
+kibnal_debug_rx (kib_rx_t *rx)
+{
+ CDEBUG(D_CONSOLE, " %p nob %d msg_type %x "
+ "cred %d seq "LPD64"\n",
+ rx, rx->rx_nob, rx->rx_msg->ibm_type,
+ rx->rx_msg->ibm_credits, rx->rx_msg->ibm_seq);
+}
+
+void
+kibnal_debug_tx (kib_tx_t *tx)
+{
+ CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx "
+ "cookie "LPX64" msg %s%s type %x cred %d seq "LPD64"\n",
+ tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
+ tx->tx_status, tx->tx_deadline, tx->tx_cookie,
+ tx->tx_lntmsg[0] == NULL ? "-" : "!",
+ tx->tx_lntmsg[1] == NULL ? "-" : "!",
+ tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits,
+ tx->tx_msg->ibm_seq);
+}
+
+void
+kibnal_debug_conn (kib_conn_t *conn)
+{
+ struct list_head *tmp;
+ int i;
+
+ spin_lock(&conn->ibc_lock);
+
+ CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n",
+ atomic_read(&conn->ibc_refcount), conn,
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ CDEBUG(D_CONSOLE, " txseq "LPD64" rxseq "LPD64" state %d \n",
+ conn->ibc_txseq, conn->ibc_rxseq, conn->ibc_state);
+ CDEBUG(D_CONSOLE, " nposted %d cred %d o_cred %d r_cred %d\n",
+ conn->ibc_nsends_posted, conn->ibc_credits,
+ conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
+ CDEBUG(D_CONSOLE, " disc %d comms_err %d\n",
+ conn->ibc_disconnect, conn->ibc_comms_error);
+
+ CDEBUG(D_CONSOLE, " early_rxs:\n");
+ list_for_each(tmp, &conn->ibc_early_rxs)
+ kibnal_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
+
+ CDEBUG(D_CONSOLE, " tx_queue_nocred:\n");
+ list_for_each(tmp, &conn->ibc_tx_queue_nocred)
+ kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+ CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n");
+ list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
+ kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+ CDEBUG(D_CONSOLE, " tx_queue:\n");
+ list_for_each(tmp, &conn->ibc_tx_queue)
+ kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+ CDEBUG(D_CONSOLE, " active_txs:\n");
+ list_for_each(tmp, &conn->ibc_active_txs)
+ kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+ CDEBUG(D_CONSOLE, " rxs:\n");
+ for (i = 0; i < IBNAL_RX_MSGS; i++)
+ kibnal_debug_rx(&conn->ibc_rxs[i]);
+
+ spin_unlock(&conn->ibc_lock);
+}
+
int
kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
{
static vv_qp_attr_t attr;
-
+
kib_connvars_t *cv = conn->ibc_connvars;
vv_return_t vvrc;
-
+
/* Only called by connd => static OK */
LASSERT (!in_interrupt());
LASSERT (current == kibnal_data.kib_connd);
memset(&attr, 0, sizeof(attr));
-
+
switch (new_state) {
default:
LBUG();
-
+
case vv_qp_state_init: {
struct vv_qp_modify_init_st *init = &attr.modify.params.init;
init->access_control = vv_acc_r_mem_read |
vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */
- attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX |
+ attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX |
VV_QP_AT_PHY_PORT_NUM |
VV_QP_AT_ACCESS_CON_F;
break;
rtr->destanation_qp = cv->cv_remote_qpn;
rtr->receive_psn = cv->cv_rxpsn;
rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD;
-
- // XXX ? rtr->opt_min_rnr_nak_timer = 16;
+ rtr->opt_min_rnr_nak_timer = *kibnal_tunables.kib_rnr_nak_timer;
// XXX sdp sets VV_QP_AT_OP_F but no actual optional options
- attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC |
+ attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC |
VV_QP_AT_DEST_QP |
- VV_QP_AT_R_PSN |
+ VV_QP_AT_R_PSN |
VV_QP_AT_MIN_RNR_NAK_T |
VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM |
VV_QP_AT_OP_F;
struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts;
rts->send_psn = cv->cv_txpsn;
- rts->local_ack_timeout = IBNAL_LOCAL_ACK_TIMEOUT;
- rts->retry_num = IBNAL_RETRY_CNT;
- rts->rnr_num = IBNAL_RNR_CNT;
+ rts->local_ack_timeout = *kibnal_tunables.kib_local_ack_timeout;
+ rts->retry_num = *kibnal_tunables.kib_retry_cnt;
+ rts->rnr_num = *kibnal_tunables.kib_rnr_cnt;
rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD;
-
+
attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN |
VV_QP_AT_L_ACK_T |
VV_QP_AT_RETRY_NUM |
attr.modify.vv_qp_attr_mask = 0;
break;
}
-
+
attr.modify.qp_modify_into_state = new_state;
attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE;
-
+
vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL);
if (vvrc != vv_return_ok) {
- CERROR("Can't modify qp -> "LPX64" state to %d: %d\n",
- conn->ibc_peer->ibp_nid, new_state, vvrc);
+ CERROR("Can't modify qp -> %s state to %d: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ new_state, vvrc);
return -EIO;
}
-
+
return 0;
}
{
kib_conn_t *conn;
int i;
- __u64 vaddr = 0;
- __u64 vaddr_base;
int page_offset;
int ipage;
vv_return_t vvrc;
/* Only the connd creates conns => single threaded */
LASSERT(!in_interrupt());
LASSERT(current == kibnal_data.kib_connd);
-
- PORTAL_ALLOC(conn, sizeof (*conn));
+
+ LIBCFS_ALLOC(conn, sizeof (*conn));
if (conn == NULL) {
CERROR ("Can't allocate connection\n");
return (NULL);
/* zero flags, NULL pointers etc... */
memset (conn, 0, sizeof (*conn));
+ conn->ibc_version = IBNAL_MSG_VERSION; /* Use latest version at first */
+
INIT_LIST_HEAD (&conn->ibc_early_rxs);
+ INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
INIT_LIST_HEAD (&conn->ibc_tx_queue);
+ INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
INIT_LIST_HEAD (&conn->ibc_active_txs);
spin_lock_init (&conn->ibc_lock);
-
+
atomic_inc (&kibnal_data.kib_nconns);
/* well not really, but I call destroy() on failure, which decrements */
conn->ibc_cep = cep;
- PORTAL_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+ LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
if (conn->ibc_connvars == NULL) {
CERROR("Can't allocate in-progress connection state\n");
goto failed;
get_random_bytes(&conn->ibc_connvars->cv_rxpsn,
sizeof(conn->ibc_connvars->cv_rxpsn));
- PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+ LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
if (conn->ibc_rxs == NULL) {
CERROR("Cannot allocate RX buffers\n");
goto failed;
if (rc != 0)
goto failed;
- vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
-
for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
- struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
- kib_rx_t *rx = &conn->ibc_rxs[i];
+ struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+ kib_rx_t *rx = &conn->ibc_rxs[i];
+ vv_mem_reg_h_t mem_h;
+ vv_r_key_t r_key;
rx->rx_conn = conn;
- rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+ rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
page_offset);
-#if IBNAL_WHOLE_MEM
- {
- vv_mem_reg_h_t mem_h;
- vv_r_key_t r_key;
-
- /* Voltaire stack already registers the whole
- * memory, so use that API. */
- vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
- rx->rx_msg,
- IBNAL_MSG_SIZE,
- &mem_h,
- &rx->rx_lkey,
- &r_key);
- LASSERT (vvrc == vv_return_ok);
- }
-#else
- rx->rx_vaddr = vaddr;
-#endif
- CDEBUG(D_NET, "Rx[%d] %p->%p[%x:"LPX64"]\n", i, rx,
- rx->rx_msg, KIBNAL_RX_LKEY(rx), KIBNAL_RX_VADDR(rx));
-
- vaddr += IBNAL_MSG_SIZE;
- LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
-
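+ /* The Voltaire stack has already registered all memory, so just
+ * look up this rx buffer's lkey */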
+ vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+ rx->rx_msg,
+ IBNAL_MSG_SIZE,
+ &mem_h,
+ &rx->rx_lkey,
+ &r_key);
+ LASSERT (vvrc == vv_return_ok);
+
+ CDEBUG(D_NET, "Rx[%d] %p->%p[%x]\n", i, rx,
+ rx->rx_msg, rx->rx_lkey);
+
page_offset += IBNAL_MSG_SIZE;
LASSERT (page_offset <= PAGE_SIZE);
reqattr.create.qp_type = vv_qp_type_r_conn;
reqattr.create.cq_send_h = kibnal_data.kib_cq;
reqattr.create.cq_receive_h = kibnal_data.kib_cq;
- reqattr.create.send_max_outstand_wr = (1 + IBNAL_MAX_RDMA_FRAGS) *
- IBNAL_MSG_QUEUE_SIZE;
+ reqattr.create.send_max_outstand_wr = (1 + IBNAL_MAX_RDMA_FRAGS) *
+ (*kibnal_tunables.kib_concurrent_sends);
reqattr.create.receive_max_outstand_wr = IBNAL_RX_MSGS;
reqattr.create.max_scatgat_per_send_wr = 1;
reqattr.create.max_scatgat_per_receive_wr = 1;
}
/* Mark QP created */
- conn->ibc_state = IBNAL_CONN_INIT;
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num;
- if (rspattr.create_return.receive_max_outstand_wr <
- IBNAL_MSG_QUEUE_SIZE ||
- rspattr.create_return.send_max_outstand_wr <
- (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE) {
+ if (rspattr.create_return.receive_max_outstand_wr <
+ IBNAL_RX_MSGS ||
+ rspattr.create_return.send_max_outstand_wr <
+ (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends)) {
CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n",
- IBNAL_MSG_QUEUE_SIZE,
- (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE,
+ IBNAL_RX_MSGS,
+ (1 + IBNAL_MAX_RDMA_FRAGS) *
+ (*kibnal_tunables.kib_concurrent_sends),
rspattr.create_return.receive_max_outstand_wr,
rspattr.create_return.send_max_outstand_wr);
goto failed;
}
+ /* Mark init complete */
+ conn->ibc_state = IBNAL_CONN_INIT;
+
/* 1 ref for caller */
atomic_set (&conn->ibc_refcount, 1);
return (conn);
-
+
failed:
kibnal_destroy_conn (conn);
return (NULL);
/* Only the connd does this (i.e. single threaded) */
LASSERT (!in_interrupt());
LASSERT (current == kibnal_data.kib_connd);
-
+
CDEBUG (D_NET, "connection %p\n", conn);
LASSERT (atomic_read (&conn->ibc_refcount) == 0);
LASSERT (list_empty(&conn->ibc_early_rxs));
LASSERT (list_empty(&conn->ibc_tx_queue));
+ LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+ LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
LASSERT (list_empty(&conn->ibc_active_txs));
LASSERT (conn->ibc_nsends_posted == 0);
/* fall through */
case IBNAL_CONN_INIT:
+ vvrc = cm_destroy_cep(conn->ibc_cep);
+ LASSERT (vvrc == vv_return_ok);
+ /* fall through */
+
+ case IBNAL_CONN_INIT_QP:
kibnal_set_qp_state(conn, vv_qp_state_reset);
vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp);
if (vvrc != vv_return_ok)
CERROR("Can't destroy QP: %d\n", vvrc);
/* fall through */
-
+
case IBNAL_CONN_INIT_NOTHING:
break;
}
- if (conn->ibc_rx_pages != NULL)
+ if (conn->ibc_rx_pages != NULL)
kibnal_free_pages(conn->ibc_rx_pages);
if (conn->ibc_rxs != NULL)
- PORTAL_FREE(conn->ibc_rxs,
+ LIBCFS_FREE(conn->ibc_rxs,
IBNAL_RX_MSGS * sizeof(kib_rx_t));
if (conn->ibc_connvars != NULL)
- PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+ LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
if (conn->ibc_peer != NULL)
kibnal_peer_decref(conn->ibc_peer);
- vvrc = cm_destroy_cep(conn->ibc_cep);
- LASSERT (vvrc == vv_return_ok);
-
- PORTAL_FREE(conn, sizeof (*conn));
+ LIBCFS_FREE(conn, sizeof (*conn));
atomic_dec(&kibnal_data.kib_nconns);
}
if (conn->ibc_incarnation == incarnation)
continue;
- CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
- peer->ibp_nid, conn->ibc_incarnation, incarnation);
-
+ CDEBUG(D_NET, "Closing stale conn -> %s incarnation:"LPX64"("LPX64")\n",
+ libcfs_nid2str(peer->ibp_nid),
+ conn->ibc_incarnation, incarnation);
+
count++;
kibnal_close_conn_locked (conn, -ESTALE);
}
}
int
-kibnal_close_matching_conns (ptl_nid_t nid)
+kibnal_close_matching_conns (lnet_nid_t nid)
{
kib_peer_t *peer;
struct list_head *ptmp;
write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
- if (nid != PTL_NID_ANY)
+ if (nid != LNET_NID_ANY)
lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
else {
lo = 0;
peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 ||
peer->ibp_connecting != 0 ||
+ peer->ibp_accepting != 0 ||
!list_empty (&peer->ibp_conns));
- if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+ if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
continue;
count += kibnal_close_peer_conns_locked (peer, 0);
write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
/* wildcards always succeed */
- if (nid == PTL_NID_ANY)
+ if (nid == LNET_NID_ANY)
return (0);
-
+
return (count == 0 ? -ENOENT : 0);
}
int
-kibnal_cmd(struct portals_cfg *pcfg, void * private)
+kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
{
- int rc = -EINVAL;
+ struct libcfs_ioctl_data *data = arg;
+ int rc = -EINVAL;
- LASSERT (pcfg != NULL);
+ LASSERT (ni == kibnal_data.kib_ni);
- switch(pcfg->pcfg_command) {
- case NAL_CMD_GET_PEER: {
- ptl_nid_t nid = 0;
- __u32 ip = 0;
- int share_count = 0;
+ switch(cmd) {
+ case IOC_LIBCFS_GET_PEER: {
+ lnet_nid_t nid = 0;
+ __u32 ip = 0;
+ int share_count = 0;
- rc = kibnal_get_peer_info(pcfg->pcfg_count,
+ rc = kibnal_get_peer_info(data->ioc_count,
&nid, &ip, &share_count);
- pcfg->pcfg_nid = nid;
- pcfg->pcfg_size = 0;
- pcfg->pcfg_id = ip;
- pcfg->pcfg_misc = IBNAL_SERVICE_NUMBER; /* port */
- pcfg->pcfg_count = 0;
- pcfg->pcfg_wait = share_count;
+ data->ioc_nid = nid;
+ data->ioc_count = share_count;
+ data->ioc_u32[0] = ip;
+ data->ioc_u32[1] = *kibnal_tunables.kib_service_number; /* port */
break;
}
- case NAL_CMD_ADD_PEER: {
- rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
- pcfg->pcfg_id); /* IP */
+ case IOC_LIBCFS_ADD_PEER: {
+ rc = kibnal_add_persistent_peer (data->ioc_nid,
+ data->ioc_u32[0]); /* IP */
break;
}
- case NAL_CMD_DEL_PEER: {
- rc = kibnal_del_peer (pcfg->pcfg_nid,
- /* flags == single_share */
- pcfg->pcfg_flags != 0);
+ case IOC_LIBCFS_DEL_PEER: {
+ rc = kibnal_del_peer (data->ioc_nid);
break;
}
- case NAL_CMD_GET_CONN: {
- kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+ case IOC_LIBCFS_GET_CONN: {
+ kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
if (conn == NULL)
rc = -ENOENT;
else {
+ // kibnal_debug_conn(conn);
rc = 0;
- pcfg->pcfg_nid = conn->ibc_peer->ibp_nid;
- pcfg->pcfg_id = 0;
- pcfg->pcfg_misc = 0;
- pcfg->pcfg_flags = 0;
+ data->ioc_nid = conn->ibc_peer->ibp_nid;
kibnal_conn_decref(conn);
}
break;
}
- case NAL_CMD_CLOSE_CONNECTION: {
- rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+ case IOC_LIBCFS_CLOSE_CONNECTION: {
+ rc = kibnal_close_matching_conns (data->ioc_nid);
break;
}
- case NAL_CMD_REGISTER_MYNID: {
- if (pcfg->pcfg_nid == PTL_NID_ANY)
+ case IOC_LIBCFS_REGISTER_MYNID: {
+ if (ni->ni_nid == data->ioc_nid) {
+ rc = 0;
+ } else {
+ CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+ libcfs_nid2str(data->ioc_nid),
+ libcfs_nid2str(ni->ni_nid));
rc = -EINVAL;
- else
- rc = kibnal_set_mynid (pcfg->pcfg_nid);
+ }
break;
}
}
kibnal_free_pages (kib_pages_t *p)
{
int npages = p->ibp_npages;
- vv_return_t vvrc;
int i;
-
- if (p->ibp_mapped) {
- vvrc = vv_mem_region_destroy(kibnal_data.kib_hca,
- p->ibp_handle);
- if (vvrc != vv_return_ok)
- CERROR ("Deregister error: %d\n", vvrc);
- }
-
+
for (i = 0; i < npages; i++)
if (p->ibp_pages[i] != NULL)
__free_page(p->ibp_pages[i]);
-
- PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+
+ LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
}
int
{
kib_pages_t *p;
int i;
-#if !IBNAL_WHOLE_MEM
- vv_phy_list_t vv_phys;
- vv_phy_buf_t *phys_pages;
- vv_return_t vvrc;
- vv_access_con_bit_mask_t access;
-#endif
- PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+ LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
if (p == NULL) {
CERROR ("Can't allocate buffer %d\n", npages);
return (-ENOMEM);
memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
p->ibp_npages = npages;
-
+
for (i = 0; i < npages; i++) {
p->ibp_pages[i] = alloc_page (GFP_KERNEL);
if (p->ibp_pages[i] == NULL) {
}
}
-#if !IBNAL_WHOLE_MEM
- PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
- if (phys_pages == NULL) {
- CERROR ("Can't allocate physarray for %d pages\n", npages);
- kibnal_free_pages(p);
- return (-ENOMEM);
- }
-
- vv_phys.number_of_buff = npages;
- vv_phys.phy_list = phys_pages;
-
- for (i = 0; i < npages; i++) {
- phys_pages[i].size = PAGE_SIZE;
- phys_pages[i].start = page_to_phys(p->ibp_pages[i]);
- }
-
- VV_ACCESS_CONTROL_MASK_SET_ALL(access);
-
- vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
- &vv_phys,
- 0, /* requested vaddr */
- npages * PAGE_SIZE, 0, /* offset */
- kibnal_data.kib_pd,
- access,
- &p->ibp_handle,
- &p->ibp_vaddr,
- &p->ibp_lkey,
- &p->ibp_rkey);
-
- PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
-
- if (vvrc != vv_return_ok) {
- CERROR ("Error %d mapping %d pages\n", vvrc, npages);
- kibnal_free_pages(p);
- return (-EFAULT);
- }
-
- CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" "
- "lkey %x rkey %x\n", npages, p->ibp_handle,
- p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
-
- p->ibp_mapped = 1;
-#endif
*pp = p;
return (0);
}
int
-kibnal_alloc_tx_descs (void)
+kibnal_alloc_tx_descs (void)
{
int i;
-
- PORTAL_ALLOC (kibnal_data.kib_tx_descs,
- IBNAL_TX_MSGS * sizeof(kib_tx_t));
+
+ LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS() * sizeof(kib_tx_t));
if (kibnal_data.kib_tx_descs == NULL)
return -ENOMEM;
-
+
memset(kibnal_data.kib_tx_descs, 0,
- IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ IBNAL_TX_MSGS() * sizeof(kib_tx_t));
- for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ for (i = 0; i < IBNAL_TX_MSGS(); i++) {
kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
- PORTAL_ALLOC(tx->tx_wrq,
- (1 + IBNAL_MAX_RDMA_FRAGS) *
+#if IBNAL_USE_FMR
+ LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
+ sizeof(*tx->tx_pages));
+ if (tx->tx_pages == NULL)
+ return -ENOMEM;
+#else
+ LIBCFS_ALLOC(tx->tx_wrq,
+ (1 + IBNAL_MAX_RDMA_FRAGS) *
sizeof(*tx->tx_wrq));
if (tx->tx_wrq == NULL)
return -ENOMEM;
-
- PORTAL_ALLOC(tx->tx_gl,
- (1 + IBNAL_MAX_RDMA_FRAGS) *
+
+ LIBCFS_ALLOC(tx->tx_gl,
+ (1 + IBNAL_MAX_RDMA_FRAGS) *
sizeof(*tx->tx_gl));
if (tx->tx_gl == NULL)
return -ENOMEM;
-
- PORTAL_ALLOC(tx->tx_rd,
- offsetof(kib_rdma_desc_t,
+
+ LIBCFS_ALLOC(tx->tx_rd,
+ offsetof(kib_rdma_desc_t,
rd_frags[IBNAL_MAX_RDMA_FRAGS]));
if (tx->tx_rd == NULL)
return -ENOMEM;
+#endif
}
return 0;
}
void
-kibnal_free_tx_descs (void)
+kibnal_free_tx_descs (void)
{
int i;
if (kibnal_data.kib_tx_descs == NULL)
return;
- for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ for (i = 0; i < IBNAL_TX_MSGS(); i++) {
kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
+#if IBNAL_USE_FMR
+ if (tx->tx_pages != NULL)
+ LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
+ sizeof(*tx->tx_pages));
+#else
if (tx->tx_wrq != NULL)
- PORTAL_FREE(tx->tx_wrq,
- (1 + IBNAL_MAX_RDMA_FRAGS) *
+ LIBCFS_FREE(tx->tx_wrq,
+ (1 + IBNAL_MAX_RDMA_FRAGS) *
sizeof(*tx->tx_wrq));
if (tx->tx_gl != NULL)
- PORTAL_FREE(tx->tx_gl,
- (1 + IBNAL_MAX_RDMA_FRAGS) *
+ LIBCFS_FREE(tx->tx_gl,
+ (1 + IBNAL_MAX_RDMA_FRAGS) *
sizeof(*tx->tx_gl));
if (tx->tx_rd != NULL)
- PORTAL_FREE(tx->tx_rd,
- offsetof(kib_rdma_desc_t,
+ LIBCFS_FREE(tx->tx_rd,
+ offsetof(kib_rdma_desc_t,
rd_frags[IBNAL_MAX_RDMA_FRAGS]));
+#endif
}
- PORTAL_FREE(kibnal_data.kib_tx_descs,
- IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ LIBCFS_FREE(kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS() * sizeof(kib_tx_t));
+}
+
+#if IBNAL_USE_FMR
+void
+kibnal_free_fmrs (int n)
+{
+ int i;
+ vv_return_t vvrc;
+ kib_tx_t *tx;
+
+ for (i = 0; i < n; i++) {
+ tx = &kibnal_data.kib_tx_descs[i];
+
+ vvrc = vv_free_fmr(kibnal_data.kib_hca,
+ tx->tx_md.md_fmrhandle);
+ if (vvrc != vv_return_ok)
+ CWARN("vv_free_fmr[%d]: %d\n", i, vvrc);
+ }
}
+#endif
int
kibnal_setup_tx_descs (void)
{
- int ipage = 0;
- int page_offset = 0;
- __u64 vaddr;
- __u64 vaddr_base;
- struct page *page;
- kib_tx_t *tx;
- int i;
- int rc;
+ int ipage = 0;
+ int page_offset = 0;
+ struct page *page;
+ kib_tx_t *tx;
+ vv_mem_reg_h_t mem_h;
+ vv_r_key_t rkey;
+ vv_return_t vvrc;
+ int i;
+ int rc;
+#if IBNAL_USE_FMR
+ vv_fmr_t fmr_props;
+#endif
/* pre-mapped messages are not bigger than 1 page */
CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
/* No fancy arithmetic when we do the buffer calculations */
CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
- rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES,
- 0);
+ rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
+ IBNAL_TX_MSG_PAGES(), 0);
if (rc != 0)
return (rc);
- /* ignored for the whole_mem case */
- vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
-
- for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ for (i = 0; i < IBNAL_TX_MSGS(); i++) {
page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
tx = &kibnal_data.kib_tx_descs[i];
- tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
- page_offset);
-#if IBNAL_WHOLE_MEM
- {
- vv_mem_reg_h_t mem_h;
- vv_r_key_t rkey;
- vv_return_t vvrc;
-
- /* Voltaire stack already registers the whole
- * memory, so use that API. */
- vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
- tx->tx_msg,
- IBNAL_MSG_SIZE,
- &mem_h,
- &tx->tx_lkey,
- &rkey);
- LASSERT (vvrc == vv_return_ok);
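+ /* Allocate one FMR per tx descriptor so payload pages can be
+ * mapped on the fly */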
+#if IBNAL_USE_FMR
+ memset(&fmr_props, 0, sizeof(fmr_props));
+ fmr_props.pd_hndl = kibnal_data.kib_pd;
+ fmr_props.acl = (vv_acc_r_mem_write |
+ vv_acc_l_mem_write);
+ fmr_props.max_pages = LNET_MAX_IOV;
+ fmr_props.log2_page_sz = PAGE_SHIFT;
+ fmr_props.max_outstanding_maps = *kibnal_tunables.kib_fmr_remaps;
+
+ vvrc = vv_alloc_fmr(kibnal_data.kib_hca,
+ &fmr_props,
+ &tx->tx_md.md_fmrhandle);
+ if (vvrc != vv_return_ok) {
+ CERROR("Can't allocate fmr %d: %d\n", i, vvrc);
+ kibnal_free_fmrs(i);
+ kibnal_free_pages (kibnal_data.kib_tx_pages);
+ return -ENOMEM;
}
-#else
- tx->tx_vaddr = vaddr;
+
+ tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
+ tx->tx_md.md_active = 0;
#endif
- tx->tx_isnblk = (i >= IBNAL_NTX);
- tx->tx_mapped = KIB_TX_UNMAPPED;
+ tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+ page_offset);
- CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx,
- tx->tx_msg, KIBNAL_TX_LKEY(tx), KIBNAL_TX_VADDR(tx));
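+ /* As with the rx buffers, memory is pre-registered by the Voltaire
+ * stack; just fetch the lkey for this tx message */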
+ vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+ tx->tx_msg,
+ IBNAL_MSG_SIZE,
+ &mem_h,
+ &tx->tx_lkey,
+ &rkey);
+ LASSERT (vvrc == vv_return_ok);
- if (tx->tx_isnblk)
- list_add (&tx->tx_list,
- &kibnal_data.kib_idle_nblk_txs);
- else
- list_add (&tx->tx_list,
- &kibnal_data.kib_idle_txs);
+ CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx,
+ tx->tx_msg, tx->tx_lkey);
- vaddr += IBNAL_MSG_SIZE;
- LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+ list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
page_offset += IBNAL_MSG_SIZE;
LASSERT (page_offset <= PAGE_SIZE);
if (page_offset == PAGE_SIZE) {
page_offset = 0;
ipage++;
- LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+ LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
}
}
-
+
return (0);
}
void
-kibnal_api_shutdown (nal_t *nal)
+kibnal_shutdown (lnet_ni_t *ni)
{
- int i;
- vv_return_t vvrc;
+ int i;
+ vv_return_t vvrc;
- if (nal->nal_refct != 0) {
- /* This module got the first ref */
- PORTAL_MODULE_UNUSE;
- return;
- }
+ LASSERT (ni == kibnal_data.kib_ni);
+ LASSERT (ni->ni_data == &kibnal_data);
CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
- atomic_read (&portal_kmemory));
-
- LASSERT(nal == &kibnal_api);
+ atomic_read (&libcfs_kmemory));
switch (kibnal_data.kib_init) {
case IBNAL_INIT_ALL:
- /* stop calls to nal_cmd */
- libcfs_nal_cmd_unregister(VIBNAL);
- /* No new peers */
+ /* stop accepting connections and prevent new peers */
+ kibnal_stop_listener(ni);
- /* resetting my NID removes my listener and nukes all current
- * peers and their connections */
- kibnal_set_mynid (PTL_NID_ANY);
+ /* nuke all existing peers */
+ kibnal_del_peer(LNET_NID_ANY);
/* Wait for all peer state to clean up */
i = 2;
- while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+ while (atomic_read(&kibnal_data.kib_npeers) != 0) {
i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
"waiting for %d peers to disconnect\n",
- atomic_read (&kibnal_data.kib_npeers));
- set_current_state (TASK_UNINTERRUPTIBLE);
- schedule_timeout (HZ);
+ atomic_read(&kibnal_data.kib_npeers));
+ cfs_pause(cfs_time_seconds(1));
}
/* fall through */
case IBNAL_INIT_TXD:
kibnal_free_pages (kibnal_data.kib_tx_pages);
+#if IBNAL_USE_FMR
+ kibnal_free_fmrs(IBNAL_TX_MSGS());
+#endif
/* fall through */
case IBNAL_INIT_PD:
-#if !IBNAL_WHOLE_MEM
+#if 0
+ /* Only deallocate a PD if we actually allocated one */
vvrc = vv_pd_deallocate(kibnal_data.kib_hca,
kibnal_data.kib_pd);
if (vvrc != vv_return_ok)
kibnal_async_callback);
if (vvrc != vv_return_ok)
CERROR("vv_dell_async_event_cb error: %d\n", vvrc);
-
+
/* fall through */
case IBNAL_INIT_HCA:
CERROR ("Close HCA error: %d\n", vvrc);
/* fall through */
- case IBNAL_INIT_LIB:
- lib_fini(&kibnal_lib);
- /* fall through */
-
case IBNAL_INIT_DATA:
- LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+ LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0);
LASSERT (kibnal_data.kib_peers != NULL);
for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
LASSERT (list_empty (&kibnal_data.kib_peers[i]));
}
LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
- LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
- LASSERT (list_empty (&kibnal_data.kib_sched_txq));
LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
LASSERT (list_empty (&kibnal_data.kib_connd_conns));
LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs));
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
"Waiting for %d threads to terminate\n",
atomic_read (&kibnal_data.kib_nthreads));
- set_current_state (TASK_INTERRUPTIBLE);
- schedule_timeout (HZ);
+ cfs_pause(cfs_time_seconds(1));
}
/* fall through */
-
+
case IBNAL_INIT_NOTHING:
break;
}
kibnal_free_tx_descs();
if (kibnal_data.kib_peers != NULL)
- PORTAL_FREE (kibnal_data.kib_peers,
- sizeof (struct list_head) *
+ LIBCFS_FREE (kibnal_data.kib_peers,
+ sizeof (struct list_head) *
kibnal_data.kib_peer_hash_size);
CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
- atomic_read (&portal_kmemory));
- printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n",
- atomic_read(&portal_kmemory));
+ atomic_read (&libcfs_kmemory));
kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+ PORTAL_MODULE_UNUSE;
}
int
-kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
- ptl_ni_limits_t *requested_limits,
- ptl_ni_limits_t *actual_limits)
+kibnal_startup (lnet_ni_t *ni)
{
+ char scratch[32];
+ char ipif_name[32];
+ char *hca_name;
+ __u32 ip;
+ __u32 netmask;
+ int up;
+ int nob;
+ int devno;
struct timeval tv;
- ptl_process_id_t process_id;
- int pkmem = atomic_read(&portal_kmemory);
int rc;
int i;
vv_request_event_record_t req_er;
vv_return_t vvrc;
- LASSERT (nal == &kibnal_api);
+ LASSERT (ni->ni_lnd == &the_kiblnd);
+
+ /* Only 1 instance supported */
+ if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
+ CERROR ("Only 1 instance supported\n");
+ return -EPERM;
+ }
- if (nal->nal_refct != 0) {
- if (actual_limits != NULL)
- *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
- /* This module got the first ref */
- PORTAL_MODULE_USE;
- return (PTL_OK);
+ if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
+ CERROR ("Can't set credits(%d) > ntx(%d)\n",
+ *kibnal_tunables.kib_credits,
+ *kibnal_tunables.kib_ntx);
+ return -EINVAL;
}
- LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+ ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
+ ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
+
+ CLASSERT (LNET_MAX_INTERFACES > 1);
+
+ if (ni->ni_interfaces[0] != NULL) {
+ /* Use the HCA specified in 'networks=' */
+
+ if (ni->ni_interfaces[1] != NULL) {
+ CERROR("Multiple interfaces not supported\n");
+ return -EPERM;
+ }
+
+ /* Parse <hca base name><number> */
+ hca_name = ni->ni_interfaces[0];
+ nob = strlen(*kibnal_tunables.kib_hca_basename);
+
+ if (strncmp(hca_name, *kibnal_tunables.kib_hca_basename, nob) ||
+ sscanf(hca_name + nob, "%d%n", &devno, &nob) < 1) {
+ CERROR("Unrecognised HCA %s\n", hca_name);
+ return -EINVAL;
+ }
+
+ } else {
+ /* Use <hca base name>0 */
+ devno = 0;
+
+ hca_name = scratch;
+ snprintf(hca_name, sizeof(scratch), "%s%d",
+ *kibnal_tunables.kib_hca_basename, devno);
+ if (strlen(hca_name) == sizeof(scratch) - 1) {
+ CERROR("HCA name %s truncated\n", hca_name);
+ return -EINVAL;
+ }
+ }
+
+ /* Find IP address from <ipif base name><hca number> */
+ snprintf(ipif_name, sizeof(ipif_name), "%s%d",
+ *kibnal_tunables.kib_ipif_basename, devno);
+ if (strlen(ipif_name) == sizeof(ipif_name) - 1) {
+ CERROR("IPoIB interface name %s truncated\n", ipif_name);
+ return -EINVAL;
+ }
+
+ rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
+ if (rc != 0) {
+ CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
+ return -ENETDOWN;
+ }
+
+ if (!up) {
+ CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
+ return -ENETDOWN;
+ }
+
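+ /* Use the IPoIB interface's IP address as the host part of my NID */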
+ ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
+
+ PORTAL_MODULE_USE;
memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
-
+
+ kibnal_data.kib_ni = ni;
+ ni->ni_data = &kibnal_data;
+
do_gettimeofday(&tv);
kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
- kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER;
-
- init_MUTEX (&kibnal_data.kib_nid_mutex);
rwlock_init(&kibnal_data.kib_global_lock);
kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
- PORTAL_ALLOC (kibnal_data.kib_peers,
+ LIBCFS_ALLOC (kibnal_data.kib_peers,
sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
if (kibnal_data.kib_peers == NULL) {
goto failed;
init_waitqueue_head (&kibnal_data.kib_connd_waitq);
spin_lock_init (&kibnal_data.kib_sched_lock);
- INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
- INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
init_waitqueue_head (&kibnal_data.kib_sched_waitq);
spin_lock_init (&kibnal_data.kib_tx_lock);
INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
- INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
- init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
rc = kibnal_alloc_tx_descs();
if (rc != 0) {
CERROR("Can't allocate tx descs\n");
goto failed;
}
-
+
/* lists/ptrs/locks initialised */
kibnal_data.kib_init = IBNAL_INIT_DATA;
/*****************************************************/
- process_id.pid = requested_pid;
- process_id.nid = PTL_NID_ANY;
-
- rc = lib_init(&kibnal_lib, nal, process_id,
- requested_limits, actual_limits);
- if (rc != PTL_OK) {
- CERROR("lib_init failed: error %d\n", rc);
- goto failed;
- }
-
- /* lib interface initialised */
- kibnal_data.kib_init = IBNAL_INIT_LIB;
- /*****************************************************/
-
for (i = 0; i < IBNAL_N_SCHED; i++) {
rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
if (rc != 0) {
goto failed;
}
- /* TODO: apparently only one adapter is supported */
- vvrc = vv_hca_open("ANY_HCA", NULL, &kibnal_data.kib_hca);
+ vvrc = vv_hca_open(hca_name, NULL, &kibnal_data.kib_hca);
if (vvrc != vv_return_ok) {
- CERROR ("Can't open CA: %d\n", vvrc);
+ CERROR ("Can't open HCA %s: %d\n", hca_name, vvrc);
goto failed;
}
vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er,
kibnal_async_callback);
if (vvrc != vv_return_ok) {
- CERROR ("Can't open CA: %d\n", vvrc);
+ CERROR ("Can't set HCA %s callback: %d\n", hca_name, vvrc);
goto failed;
}
vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs);
if (vvrc != vv_return_ok) {
- CERROR ("Can't size port attrs: %d\n", vvrc);
+ CERROR ("Can't size port attrs for %s: %d\n", hca_name, vvrc);
goto failed;
}
vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
if (vvrc != vv_return_ok) {
- CERROR("vv_port_query failed for port %d: %d\n",
- port_num, vvrc);
+ CERROR("vv_port_query failed for %s port %d: %d\n",
+ hca_name, port_num, vvrc);
continue;
}
CDEBUG(D_NET, "port[%d] Active\n", port_num);
/* Found a suitable port. Get its GUID and PKEY. */
- kibnal_data.kib_port = port_num;
-
tbl_count = 1;
- vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca,
+ vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca,
port_num, &tbl_count,
&kibnal_data.kib_port_gid);
if (vvrc != vv_return_ok) {
CERROR("vv_get_port_gid_tbl failed "
- "for port %d: %d\n", port_num, vvrc);
+ "for %s port %d: %d\n",
+ hca_name, port_num, vvrc);
continue;
}
tbl_count = 1;
- vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca,
- port_num, &tbl_count,
- &kibnal_data.kib_port_pkey);
+ vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca,
+ port_num, &tbl_count,
+ &kibnal_data.kib_port_pkey);
if (vvrc != vv_return_ok) {
CERROR("vv_get_port_partition_tbl failed "
- "for port %d: %d\n", port_num, vvrc);
+ "for %s port %d: %d\n",
+ hca_name, port_num, vvrc);
continue;
}
+ kibnal_data.kib_port = port_num;
+
break;
case vv_state_linkActDefer: /* TODO: correct? */
case vv_state_linkNoChange:
- CERROR("Unexpected port[%d] state %d\n",
- i, pattr->port_state);
+ CERROR("Unexpected %s port[%d] state %d\n",
+ hca_name, i, pattr->port_state);
continue;
}
break;
}
if (kibnal_data.kib_port == -1) {
- CERROR ("Can't find an active port\n");
+ CERROR ("Can't find an active port on %s\n", hca_name);
goto failed;
}
- CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n",
- kibnal_data.kib_port,
- kibnal_data.kib_port_gid.scope.g.subnet,
+ CDEBUG(D_NET, "Using %s port %d - GID="LPX64":"LPX64"\n",
+ hca_name, kibnal_data.kib_port,
+ kibnal_data.kib_port_gid.scope.g.subnet,
kibnal_data.kib_port_gid.scope.g.eui64);
-
+
/*****************************************************/
-#if !IBNAL_WHOLE_MEM
- vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
-#else
+#if 1
+ /* We use a pre-allocated PD */
vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
+#else
+ vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
#endif
- if (vvrc != 0) {
- CERROR ("Can't create PD: %d\n", vvrc);
+ if (vvrc != vv_return_ok) {
+ CERROR ("Can't init PD: %d\n", vvrc);
goto failed;
}
-
+
/* flag PD initialised */
kibnal_data.kib_init = IBNAL_INIT_PD;
/*****************************************************/
CERROR ("Can't register tx descs: %d\n", rc);
goto failed;
}
-
+
/* flag TX descs initialised */
kibnal_data.kib_init = IBNAL_INIT_TXD;
/*****************************************************/
+
{
- uint32_t nentries;
+ __u32 nentries;
- vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
- kibnal_cq_callback,
+ vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
+ kibnal_cq_callback,
NULL, /* context */
&kibnal_data.kib_cq, &nentries);
if (vvrc != 0) {
/* flag CQ initialised */
kibnal_data.kib_init = IBNAL_INIT_CQ;
- if (nentries < IBNAL_CQ_ENTRIES) {
- CERROR ("CQ only has %d entries, need %d\n",
- nentries, IBNAL_CQ_ENTRIES);
+ if (nentries < IBNAL_CQ_ENTRIES()) {
+ CERROR ("CQ only has %d entries, need %d\n",
+ nentries, IBNAL_CQ_ENTRIES());
goto failed;
}
- vvrc = vv_request_completion_notification(kibnal_data.kib_hca,
- kibnal_data.kib_cq,
+ vvrc = vv_request_completion_notification(kibnal_data.kib_hca,
+ kibnal_data.kib_cq,
vv_next_solicit_unsolicit_event);
if (vvrc != 0) {
CERROR ("Failed to re-arm completion queue: %d\n", rc);
goto failed;
}
}
-
- /*****************************************************/
- rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL);
+ rc = kibnal_start_listener(ni);
if (rc != 0) {
- CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+ CERROR("Can't start listener: %d\n", rc);
goto failed;
}
kibnal_data.kib_init = IBNAL_INIT_ALL;
/*****************************************************/
- printk(KERN_INFO "Lustre: Voltaire IB NAL loaded "
- "(initial mem %d)\n", pkmem);
-
- return (PTL_OK);
+ return (0);
failed:
- CDEBUG(D_NET, "kibnal_api_startup failed\n");
- kibnal_api_shutdown (&kibnal_api);
- return (PTL_FAIL);
+ CDEBUG(D_NET, "kibnal_startup failed\n");
+ kibnal_shutdown (ni);
+ return (-ENETDOWN);
}
void __exit
kibnal_module_fini (void)
{
-#ifdef CONFIG_SYSCTL
- if (kibnal_tunables.kib_sysctl != NULL)
- unregister_sysctl_table (kibnal_tunables.kib_sysctl);
-#endif
- PtlNIFini(kibnal_ni);
-
- ptl_unregister_nal(VIBNAL);
+ lnet_unregister_lnd(&the_kiblnd);
+ kibnal_tunables_fini();
}
int __init
vibnal_assert_wire_constants();
- CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)
+ CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)
<= cm_REQ_priv_data_len);
- CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)
+ CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)
<= cm_REP_priv_data_len);
+ CLASSERT (sizeof(kib_msg_t) <= IBNAL_MSG_SIZE);
+#if !IBNAL_USE_FMR
CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
<= IBNAL_MSG_SIZE);
CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
<= IBNAL_MSG_SIZE);
-
- /* the following must be sizeof(int) for proc_dointvec() */
- CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
-
- kibnal_api.nal_ni_init = kibnal_api_startup;
- kibnal_api.nal_ni_fini = kibnal_api_shutdown;
-
- /* Initialise dynamic tunables to defaults once only */
- kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+#endif
+ rc = kibnal_tunables_init();
+ if (rc != 0)
+ return rc;
- rc = ptl_register_nal(VIBNAL, &kibnal_api);
- if (rc != PTL_OK) {
- CERROR("Can't register IBNAL: %d\n", rc);
- return (-ENOMEM); /* or something... */
- }
+ lnet_register_lnd(&the_kiblnd);
- /* Pure gateways want the NAL started up at module load time... */
- rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
- if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
- ptl_unregister_nal(VIBNAL);
- return (-ENODEV);
- }
-
-#ifdef CONFIG_SYSCTL
- /* Press on regardless even if registering sysctl doesn't work */
- kibnal_tunables.kib_sysctl =
- register_sysctl_table (kibnal_top_ctl_table, 0);
-#endif
- return (0);
+ return 0;
}
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel Voltaire IB LND v1.00");
MODULE_LICENSE("GPL");
module_init(kibnal_module_init);
module_exit(kibnal_module_fini);
-