Whamcloud - gitweb
Introduce .gitignore files.
[fs/lustre-release.git] / lnet / klnds / openiblnd / openiblnd.c
index 378fff9..5c3dc50 100644 (file)
@@ -1,62 +1,61 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- * Copyright (C) 2004 Cluster File Systems, Inc.
- *   Author: Eric Barton <eric@bartonsoftware.com>
+ * GPL HEADER START
  *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
  *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
  *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/openiblnd/openiblnd.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
  */
 
-#include "openibnal.h"
-
-nal_t                   kibnal_api;
-ptl_handle_ni_t         kibnal_ni;
-kib_data_t              kibnal_data;
-kib_tunables_t          kibnal_tunables;
-
-#define IBNAL_SYSCTL             202
-
-enum {
-        IBNAL_SYSCTL_TIMEOUT=1,
-        IBNAL_SYSCTL_LISTENER_TIMEOUT,
-        IBNAL_SYSCTL_BACKLOG,
-        IBNAL_SYSCTL_PORT
-};
+#include "openiblnd.h"
 
-static ctl_table kibnal_ctl_table[] = {
-        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
-         &kibnal_tunables.kib_io_timeout, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        {IBNAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout", 
-         &kibnal_tunables.kib_listener_timeout, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        {IBNAL_SYSCTL_BACKLOG, "backlog",
-         &kibnal_tunables.kib_backlog, sizeof(int),
-         0644, NULL, kibnal_listener_procint},
-        {IBNAL_SYSCTL_PORT, "port",
-         &kibnal_tunables.kib_port, sizeof(int),
-         0644, NULL, kibnal_listener_procint},
-        { 0 }
+lnd_t the_kiblnd = {
+#ifdef USING_TSAPI
+        .lnd_type       = CIBLND,
+#else
+        .lnd_type       = OPENIBLND,
+#endif
+        .lnd_startup    = kibnal_startup,
+        .lnd_shutdown   = kibnal_shutdown,
+        .lnd_ctl        = kibnal_ctl,
+        .lnd_send       = kibnal_send,
+        .lnd_recv       = kibnal_recv,
+        .lnd_eager_recv = kibnal_eager_recv,
+        .lnd_accept     = kibnal_accept,
 };
 
-static ctl_table kibnal_top_ctl_table[] = {
-        {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
-        { 0 }
-};
+kib_data_t              kibnal_data;
 
 __u32 
 kibnal_cksum (void *ptr, int nob)
@@ -79,31 +78,34 @@ kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
 }
 
 void
-kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, __u64 dststamp)
+kibnal_pack_msg(kib_msg_t *msg, int version, int credits, 
+                lnet_nid_t dstnid, __u64 dststamp)
 {
         /* CAVEAT EMPTOR! all message fields not set here should have been
          * initialised previously. */
         msg->ibm_magic    = IBNAL_MSG_MAGIC;
-        msg->ibm_version  = IBNAL_MSG_VERSION;
+        msg->ibm_version  = version;
         /*   ibm_type */
         msg->ibm_credits  = credits;
         /*   ibm_nob */
         msg->ibm_cksum    = 0;
-        msg->ibm_srcnid   = kibnal_lib.libnal_ni.ni_pid.nid;
+        msg->ibm_srcnid   = kibnal_data.kib_ni->ni_nid;
         msg->ibm_srcstamp = kibnal_data.kib_incarnation;
         msg->ibm_dstnid   = dstnid;
         msg->ibm_dststamp = dststamp;
-#if IBNAL_CKSUM
-        /* NB ibm_cksum zero while computing cksum */
-        msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
-#endif
+
+        if (*kibnal_tunables.kib_cksum) {
+                /* NB ibm_cksum zero while computing cksum */
+                msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
+        }
 }
 
 int
-kibnal_unpack_msg(kib_msg_t *msg, int nob)
+kibnal_unpack_msg(kib_msg_t *msg, int expected_version, int nob)
 {
         const int hdr_size = offsetof(kib_msg_t, ibm_u);
         __u32     msg_cksum;
+        int       msg_version;
         int       flip;
         int       msg_nob;
 
@@ -121,9 +123,12 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 return -EPROTO;
         }
 
-        if (msg->ibm_version != 
-            (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
-                CERROR("Bad version: %d\n", msg->ibm_version);
+        msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+        if ((expected_version == 0) ?
+            (msg_version != IBNAL_MSG_VERSION &&
+             msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) :
+            (msg_version != expected_version)) {
+                CERROR("Bad version: %x\n", msg_version);
                 return -EPROTO;
         }
 
@@ -151,7 +156,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
         
         if (flip) {
                 /* leave magic unflipped as a clue to peer endianness */
-                __swab16s(&msg->ibm_version);
+                msg->ibm_version = msg_version;
                 LASSERT (sizeof(msg->ibm_type) == 1);
                 LASSERT (sizeof(msg->ibm_credits) == 1);
                 msg->ibm_nob = msg_nob;
@@ -161,6 +166,11 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 __swab64s(&msg->ibm_dststamp);
         }
         
+        if (msg->ibm_srcnid == LNET_NID_ANY) {
+                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+                return -EPROTO;
+        }
+
         switch (msg->ibm_type) {
         default:
                 CERROR("Unknown message type %x\n", msg->ibm_type);
@@ -230,223 +240,11 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
 }
 
 int
-kibnal_sock_write (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
-        struct iovec  iov = {
-                .iov_base = buffer,
-                .iov_len  = nob
-        };
-        struct msghdr msg = {
-                .msg_name       = NULL,
-                .msg_namelen    = 0,
-                .msg_iov        = &iov,
-                .msg_iovlen     = 1,
-                .msg_control    = NULL,
-                .msg_controllen = 0,
-                .msg_flags      = MSG_DONTWAIT
-        };
-
-        /* We've set up the socket's send buffer to be large enough for
-         * everything we send, so a single non-blocking send should
-         * complete without error. */
-
-        set_fs(KERNEL_DS);
-        rc = sock_sendmsg(sock, &msg, iov.iov_len);
-        set_fs(oldmm);
-
-        if (rc == nob)
-                return 0;
-
-        if (rc >= 0)
-                return -EAGAIN;
-
-        return rc;
-}
-
-int
-kibnal_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
-{
-        int            rc;
-        mm_segment_t   oldmm = get_fs();
-        long           ticks = timeout * HZ;
-        unsigned long  then;
-        struct timeval tv;
-
-        LASSERT (nob > 0);
-        LASSERT (ticks > 0);
-
-        for (;;) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
-
-                /* Set receive timeout to remaining time */
-                tv = (struct timeval) {
-                        .tv_sec = ticks / HZ,
-                        .tv_usec = ((ticks % HZ) * 1000000) / HZ
-                };
-                set_fs(KERNEL_DS);
-                rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
-                                     (char *)&tv, sizeof(tv));
-                set_fs(oldmm);
-                if (rc != 0) {
-                        CERROR("Can't set socket recv timeout %d: %d\n",
-                               timeout, rc);
-                        return rc;
-                }
-
-                set_fs(KERNEL_DS);
-                then = jiffies;
-                rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
-                ticks -= jiffies - then;
-                set_fs(oldmm);
-
-                if (rc < 0)
-                        return rc;
-
-                if (rc == 0)
-                        return -ECONNABORTED;
-
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
-
-                if (nob == 0)
-                        return 0;
-
-                if (ticks <= 0)
-                        return -ETIMEDOUT;
-        }
-}
-
-int
-kibnal_create_sock(struct socket **sockp)
-{
-        struct socket       *sock;
-        int                  rc;
-        int                  option;
-        mm_segment_t         oldmm = get_fs();
-
-        rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock);
-        if (rc != 0) {
-                CERROR("Can't create socket: %d\n", rc);
-                return rc;
-        }
-
-        /* Ensure sends will not block */
-        option = 2 * sizeof(kib_msg_t);
-        set_fs(KERNEL_DS);
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-                             (char *)&option, sizeof(option));
-        set_fs(oldmm);
-        if (rc != 0) {
-                CERROR("Can't set send buffer %d: %d\n", option, rc);
-                goto failed;
-        }
-
-        option = 1;
-        set_fs(KERNEL_DS);
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-                             (char *)&option, sizeof(option));
-        set_fs(oldmm);
-        if (rc != 0) {
-                CERROR("Can't set SO_REUSEADDR: %d\n", rc);
-                goto failed;
-        }
-
-        *sockp = sock;
-        return 0;
-
- failed:
-        sock_release(sock);
-        return rc;
-}
-
-void
-kibnal_pause(int ticks)
-{
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        schedule_timeout(ticks);
-}
-
-int
-kibnal_connect_sock(kib_peer_t *peer, struct socket **sockp)
-{
-        struct sockaddr_in  locaddr;
-        struct sockaddr_in  srvaddr;
-        struct socket      *sock;
-        unsigned int        port;
-        int                 rc;
-
-        for (port = 1023; port >= 512; port--) {
-
-                memset(&locaddr, 0, sizeof(locaddr)); 
-                locaddr.sin_family      = AF_INET; 
-                locaddr.sin_port        = htons(port);
-                locaddr.sin_addr.s_addr = htonl(INADDR_ANY);
-
-                memset (&srvaddr, 0, sizeof (srvaddr));
-                srvaddr.sin_family      = AF_INET;
-                srvaddr.sin_port        = htons (peer->ibp_port);
-                srvaddr.sin_addr.s_addr = htonl (peer->ibp_ip);
-
-                rc = kibnal_create_sock(&sock);
-                if (rc != 0)
-                        return rc;
-
-                rc = sock->ops->bind(sock,
-                                     (struct sockaddr *)&locaddr, sizeof(locaddr));
-                if (rc != 0) {
-                        sock_release(sock);
-                        
-                        if (rc == -EADDRINUSE) {
-                                CDEBUG(D_NET, "Port %d already in use\n", port);
-                                continue;
-                        }
-
-                        CERROR("Can't bind to reserved port %d: %d\n", port, rc);
-                        return rc;
-                }
-
-                rc = sock->ops->connect(sock,
-                                        (struct sockaddr *)&srvaddr, sizeof(srvaddr),
-                                        0);
-                if (rc == 0) {
-                        *sockp = sock;
-                        return 0;
-                }
-                
-                sock_release(sock);
-
-                if (rc != -EADDRNOTAVAIL) {
-                        CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n",
-                               port, HIPQUAD(peer->ibp_ip), peer->ibp_port, rc);
-                        return rc;
-                }
-                
-                CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n", 
-                       port, HIPQUAD(peer->ibp_ip), peer->ibp_port);
-        }
-
-        /* all ports busy */
-        return -EHOSTUNREACH;
-}
-
-int
 kibnal_make_svcqry (kib_conn_t *conn) 
 {
         kib_peer_t    *peer = conn->ibc_peer;
+        int            version = IBNAL_MSG_VERSION;
+        int            msg_version;
         kib_msg_t     *msg;
         struct socket *sock;
         int            rc;
@@ -455,115 +253,204 @@ kibnal_make_svcqry (kib_conn_t *conn)
         LASSERT (conn->ibc_connreq != NULL);
         msg = &conn->ibc_connreq->cr_msg;
 
+ again:
         kibnal_init_msg(msg, IBNAL_MSG_SVCQRY, 0);
-        kibnal_pack_msg(msg, 0, peer->ibp_nid, 0);
+        kibnal_pack_msg(msg, version, 0, peer->ibp_nid, 0);
 
-        rc = kibnal_connect_sock(peer, &sock);
+        rc = lnet_connect(&sock, peer->ibp_nid,
+                          0, peer->ibp_ip, peer->ibp_port);
         if (rc != 0)
-                return rc;
+                return -ECONNABORTED;
         
-        rc = kibnal_sock_write(sock, msg, msg->ibm_nob);
+        rc = libcfs_sock_write(sock, msg, msg->ibm_nob,
+                               lnet_acceptor_timeout());
         if (rc != 0) {
-                CERROR("Error %d sending svcqry to "
-                       LPX64"@%u.%u.%u.%u/%d\n", rc,
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Error %d sending svcqry to %s at %u.%u.%u.%u/%d\n", 
+                       rc, libcfs_nid2str(peer->ibp_nid),
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
                 goto out;
         }
 
-        nob = offsetof(kib_msg_t, ibm_u) + sizeof(msg->ibm_u.svcrsp);
-        rc = kibnal_sock_read(sock, msg, nob, kibnal_tunables.kib_io_timeout);
+        /* The first 6 bytes are invariably MAGIC + proto version */
+        rc = libcfs_sock_read(sock, msg, 6, *kibnal_tunables.kib_timeout);
         if (rc != 0) {
-                CERROR("Error %d receiving svcrsp from "
-                       LPX64"@%u.%u.%u.%u/%d\n", rc,
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n", 
+                       rc, libcfs_nid2str(peer->ibp_nid),
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
                 goto out;
         }
 
-        rc = kibnal_unpack_msg(msg, nob);
+        if (msg->ibm_magic != IBNAL_MSG_MAGIC &&
+            msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+                CERROR("Bad magic: %08x from %s at %u.%u.%u.%u/%d\n",
+                       msg->ibm_magic, libcfs_nid2str(peer->ibp_nid),
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                rc = -EPROTO;
+                goto out;
+        }
+
+        msg_version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ? 
+                      msg->ibm_version : __swab16(msg->ibm_version);
+        if (msg_version != version) {
+                if (version == IBNAL_MSG_VERSION) {
+                        /* retry with previous version */
+                        libcfs_sock_release(sock);
+                        version = IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD;
+                        goto again;
+                }
+                
+                CERROR("Bad version %x from %s at %u.%u.%u.%u/%d\n",
+                       msg_version, libcfs_nid2str(peer->ibp_nid),
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                rc = -EPROTO;
+                goto out;
+        }
+
+        /* Read in the rest of the message now we know the expected format */
+        nob = offsetof(kib_msg_t, ibm_u) + sizeof(kib_svcrsp_t);
+        rc = libcfs_sock_read(sock, ((char *)msg) + 6, nob - 6,
+                              *kibnal_tunables.kib_timeout);
         if (rc != 0) {
-                CERROR("Error %d unpacking svcrsp from "
-                       LPX64"@%u.%u.%u.%u/%d\n", rc,
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n", 
+                       rc, libcfs_nid2str(peer->ibp_nid), 
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                goto out;
+        }
+
+        rc = kibnal_unpack_msg(msg, version, nob);
+        if (rc != 0) {
+                CERROR("Error %d unpacking svcrsp from %s at %u.%u.%u.%u/%d\n", 
+                       rc, libcfs_nid2str(peer->ibp_nid), 
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
                 goto out;
         }
                        
         if (msg->ibm_type != IBNAL_MSG_SVCRSP) {
-                CERROR("Unexpected response type %d from "
-                       LPX64"@%u.%u.%u.%u/%d\n", msg->ibm_type,
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Unexpected response type %d from %s at %u.%u.%u.%u/%d\n", 
+                       msg->ibm_type, libcfs_nid2str(peer->ibp_nid),
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
                 rc = -EPROTO;
                 goto out;
         }
         
-        if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
+        if (kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid ||
             msg->ibm_dststamp != kibnal_data.kib_incarnation) {
-                CERROR("Unexpected dst NID/stamp "LPX64"/"LPX64" from "
-                       LPX64"@%u.%u.%u.%u/%d\n", 
-                       msg->ibm_dstnid, msg->ibm_dststamp,
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Unexpected dst NID/stamp %s/"LPX64" from "
+                       "%s at %u.%u.%u.%u/%d\n", 
+                       libcfs_nid2str(msg->ibm_dstnid), msg->ibm_dststamp,
+                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), 
+                       peer->ibp_port);
                 rc = -EPROTO;
                 goto out;
         }
 
-        if (msg->ibm_srcnid != peer->ibp_nid) {
-                CERROR("Unexpected src NID "LPX64" from "
-                       LPX64"@%u.%u.%u.%u/%d\n", msg->ibm_srcnid,
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+        if (peer->ibp_nid != msg->ibm_srcnid) {
+                CERROR("Unexpected src NID %s from %s at %u.%u.%u.%u/%d\n", 
+                       libcfs_nid2str(msg->ibm_srcnid),
+                       libcfs_nid2str(peer->ibp_nid), 
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
                 rc = -EPROTO;
                 goto out;
         }
 
         conn->ibc_incarnation = msg->ibm_srcstamp;
         conn->ibc_connreq->cr_svcrsp = msg->ibm_u.svcrsp;
+        conn->ibc_version = version;
+        
  out:
-        sock_release(sock);
+        libcfs_sock_release(sock);
         return rc;
 }
 
 void
 kibnal_handle_svcqry (struct socket *sock)
 {
-        struct sockaddr_in   addr;
         __u32                peer_ip;
         unsigned int         peer_port;
         kib_msg_t           *msg;
         __u64                srcnid;
         __u64                srcstamp;
-        int                  len;
+        int                  version;
+        int                  reject = 0;
         int                  rc;
 
-        len = sizeof(addr);
-        rc = sock->ops->getname(sock, (struct sockaddr *)&addr, &len, 2);
+        rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
         if (rc != 0) {
                 CERROR("Can't get peer's IP: %d\n", rc);
                 return;
         }
 
-        peer_ip = ntohl(addr.sin_addr.s_addr);
-        peer_port = ntohs(addr.sin_port);
-
-        if (peer_port >= 1024) {
-                CERROR("Refusing unprivileged connection from %u.%u.%u.%u/%d\n",
+        LIBCFS_ALLOC(msg, sizeof(*msg));
+        if (msg == NULL) {
+                CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n",
                        HIPQUAD(peer_ip), peer_port);
                 return;
         }
+        
+        rc = libcfs_sock_read(sock, &msg->ibm_magic, sizeof(msg->ibm_magic),
+                              lnet_acceptor_timeout());
+        if (rc != 0) {
+                CERROR("Error %d receiving svcqry(1) from %u.%u.%u.%u/%d\n",
+                       rc, HIPQUAD(peer_ip), peer_port);
+                goto out;
+        }
 
-        PORTAL_ALLOC(msg, sizeof(*msg));
-        if (msg == NULL) {
-                CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n",
-                       HIPQUAD(peer_ip), peer_port);
+        if (msg->ibm_magic != IBNAL_MSG_MAGIC &&
+            msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+                /* Unexpected magic! */
+                if (msg->ibm_magic == LNET_PROTO_MAGIC ||
+                    msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) {
+                        /* future protocol version compatibility!  When LNET
+                         * unifies protocols over all LNDs, the first thing
+                         * sent will be a version query.  I send back a reply
+                         * in my current protocol to tell her I'm "old" */
+                        kibnal_init_msg(msg, 0, 0);
+                        kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0, 
+                                        LNET_NID_ANY, 0);
+                        reject = 1;
+                        goto reply;
+                }
+
+                CERROR ("Bad magic(1) %#08x (%#08x expected) from "
+                        "%u.%u.%u.%u/%d\n", msg->ibm_magic,
+                        IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port);
                 goto out;
         }
+
+        /* Now check version */
+
+        rc = libcfs_sock_read(sock, &msg->ibm_version, sizeof(msg->ibm_version),
+                              lnet_acceptor_timeout());
+        if (rc != 0) {
+                CERROR("Error %d receiving svcqry(2) from %u.%u.%u.%u/%d\n",
+                       rc, HIPQUAD(peer_ip), peer_port);
+                goto out;
+        }
+
+        version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ?
+                  msg->ibm_version : __swab16(msg->ibm_version);
+        /* Peer is a different protocol version: reply in my current protocol
+         * to tell her I'm "old" */
+        if (version != IBNAL_MSG_VERSION &&
+            version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+                kibnal_init_msg(msg, 0, 0);
+                kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0, LNET_NID_ANY, 0);
+                reject = 1;
+                goto reply;
+        }
         
-        rc = kibnal_sock_read(sock, msg, offsetof(kib_msg_t, ibm_u),
-                              kibnal_tunables.kib_listener_timeout);
+        /* Now read in all the rest */
+        rc = libcfs_sock_read(sock, &msg->ibm_type,
+                              offsetof(kib_msg_t, ibm_u) -
+                              offsetof(kib_msg_t, ibm_type),
+                              lnet_acceptor_timeout());
         if (rc != 0) {
-                CERROR("Error %d receiving svcqry from %u.%u.%u.%u/%d\n",
+                CERROR("Error %d receiving svcqry(3) from %u.%u.%u.%u/%d\n",
                        rc, HIPQUAD(peer_ip), peer_port);
                 goto out;
         }
         
-        rc = kibnal_unpack_msg(msg, offsetof(kib_msg_t, ibm_u));
+        rc = kibnal_unpack_msg(msg, version, offsetof(kib_msg_t, ibm_u));
         if (rc != 0) {
                 CERROR("Error %d unpacking svcqry from %u.%u.%u.%u/%d\n",
                        rc, HIPQUAD(peer_ip), peer_port);
@@ -576,10 +463,10 @@ kibnal_handle_svcqry (struct socket *sock)
                 goto out;
         }
         
-        if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
-                CERROR("Unexpected dstnid "LPX64"(expected "LPX64" "
-                       "from %u.%u.%u.%u/%d\n", msg->ibm_dstnid,
-                       kibnal_lib.libnal_ni.ni_pid.nid,
+        if (kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid) {
+                CERROR("Unexpected dstnid %s: expected %s from %u.%u.%u.%u/%d\n",
+                       libcfs_nid2str(msg->ibm_dstnid),
+                       libcfs_nid2str(kibnal_data.kib_ni->ni_nid),
                        HIPQUAD(peer_ip), peer_port);
                 goto out;
         }
@@ -594,265 +481,49 @@ kibnal_handle_svcqry (struct socket *sock)
                sizeof(kibnal_data.kib_svc_gid));
         msg->ibm_u.svcrsp.ibsr_svc_pkey = kibnal_data.kib_svc_pkey;
 
-        kibnal_pack_msg(msg, 0, srcnid, srcstamp);
-        
-        rc = kibnal_sock_write (sock, msg, msg->ibm_nob);
-        if (rc != 0) {
+        kibnal_pack_msg(msg, version, 0, srcnid, srcstamp);
+
+ reply:
+        rc = libcfs_sock_write (sock, msg, msg->ibm_nob,
+                                lnet_acceptor_timeout());
+        if (!reject && rc != 0) {
+                /* Only complain if we're not rejecting */
                 CERROR("Error %d replying to svcqry from %u.%u.%u.%u/%d\n",
                        rc, HIPQUAD(peer_ip), peer_port);
-                goto out;
         }
         
  out:
-        PORTAL_FREE(msg, sizeof(*msg));
+        LIBCFS_FREE(msg, sizeof(*msg));
 }
 
 void
 kibnal_free_acceptsock (kib_acceptsock_t *as)
 {
-        sock_release(as->ibas_sock);
-        PORTAL_FREE(as, sizeof(*as));
+        libcfs_sock_release(as->ibas_sock);
+        LIBCFS_FREE(as, sizeof(*as));
 }
 
 int
-kibnal_ip_listener(void *arg)
+kibnal_accept(lnet_ni_t *ni, struct socket *sock)
 {
-        struct sockaddr_in addr;
-        wait_queue_t       wait;
-        struct socket     *sock;
         kib_acceptsock_t  *as;
-        int                port;
-        char               name[16];
-        int                rc;
         unsigned long      flags;
 
-        /* Parent thread holds kib_nid_mutex, and is, or is about to
-         * block on kib_listener_signal */
-
-        port = kibnal_tunables.kib_port;
-        snprintf(name, sizeof(name), "kibnal_lstn%03d", port);
-        kportal_daemonize(name);
-        kportal_blockallsigs();
-
-        init_waitqueue_entry(&wait, current);
-
-        rc = kibnal_create_sock(&sock);
-        if (rc != 0)
-                goto out_0;
-
-        memset(&addr, 0, sizeof(addr));
-        addr.sin_family      = AF_INET;
-        addr.sin_port        = htons(port);
-        addr.sin_addr.s_addr = INADDR_ANY;
-
-        rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr));
-        if (rc != 0) {
-                CERROR("Can't bind to port %d\n", port);
-                goto out_1;
+        LIBCFS_ALLOC(as, sizeof(*as));
+        if (as == NULL) {
+                CERROR("Out of Memory\n");
+                return -ENOMEM;
         }
 
-        rc = sock->ops->listen(sock, kibnal_tunables.kib_backlog);
-        if (rc != 0) {
-                CERROR("Can't set listen backlog %d: %d\n", 
-                       kibnal_tunables.kib_backlog, rc);
-                goto out_1;
-        }
-
-        LASSERT (kibnal_data.kib_listener_sock == NULL);
-        kibnal_data.kib_listener_sock = sock;
-
-        /* unblock waiting parent */
-        LASSERT (kibnal_data.kib_listener_shutdown == 0);
-        up(&kibnal_data.kib_listener_signal);
-
-        /* Wake me any time something happens on my socket */
-        add_wait_queue(sock->sk->sk_sleep, &wait);
-        as = NULL;
-
-        while (kibnal_data.kib_listener_shutdown == 0) {
-
-                if (as == NULL) {
-                        PORTAL_ALLOC(as, sizeof(*as));
-                        if (as == NULL) {
-                                CERROR("Out of Memory: pausing...\n");
-                                kibnal_pause(HZ);
-                                continue;
-                        }
-                        as->ibas_sock = NULL;
-                }
-
-                if (as->ibas_sock == NULL) {
-                        as->ibas_sock = sock_alloc();
-                        if (as->ibas_sock == NULL) {
-                                CERROR("Can't allocate socket: pausing...\n");
-                                kibnal_pause(HZ);
-                                continue;
-                        }
-                        /* XXX this should add a ref to sock->ops->owner, if
-                         * TCP could be a module */
-                        as->ibas_sock->type = sock->type;
-                        as->ibas_sock->ops = sock->ops;
-                }
-                
-                set_current_state(TASK_INTERRUPTIBLE);
-
-                rc = sock->ops->accept(sock, as->ibas_sock, O_NONBLOCK);
-
-                /* Sleep for socket activity? */
-                if (rc == -EAGAIN &&
-                    kibnal_data.kib_listener_shutdown == 0)
-                        schedule();
-
-                set_current_state(TASK_RUNNING);
-
-                if (rc == 0) {
-                        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
-                        
-                        list_add_tail(&as->ibas_list, 
-                                      &kibnal_data.kib_connd_acceptq);
-
-                        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
-                        wake_up(&kibnal_data.kib_connd_waitq);
-
-                        as = NULL;
-                        continue;
-                }
+        as->ibas_sock = sock;
                 
-                if (rc != -EAGAIN) {
-                        CERROR("Accept failed: %d, pausing...\n", rc);
-                        kibnal_pause(HZ);
-                }
-        }
-
-        if (as != NULL) {
-                if (as->ibas_sock != NULL)
-                        sock_release(as->ibas_sock);
-                PORTAL_FREE(as, sizeof(*as));
-        }
-
-        rc = 0;
-        remove_wait_queue(sock->sk->sk_sleep, &wait);
- out_1:
-        sock_release(sock);
-        kibnal_data.kib_listener_sock = NULL;
- out_0:
-        /* set completion status and unblock thread waiting for me 
-         * (parent on startup failure, executioner on normal shutdown) */
-        kibnal_data.kib_listener_shutdown = rc;
-        up(&kibnal_data.kib_listener_signal);
-
-        return 0;
-}
-
-int
-kibnal_start_ip_listener (void)
-{
-        long           pid;
-        int            rc;
-
-        CDEBUG(D_WARNING, "Starting listener\n");
-
-        /* Called holding kib_nid_mutex: listener stopped */
-        LASSERT (kibnal_data.kib_listener_sock == NULL);
-
-        kibnal_data.kib_listener_shutdown = 0;
-        pid = kernel_thread(kibnal_ip_listener, NULL, 0);
-        if (pid < 0) {
-                CERROR("Can't spawn listener: %ld\n", pid);
-                return (int)pid;
-        }
-
-        /* Block until listener has started up. */
-        down(&kibnal_data.kib_listener_signal);
-
-        rc = kibnal_data.kib_listener_shutdown;
-        LASSERT ((rc != 0) == (kibnal_data.kib_listener_sock == NULL));
-
-        CDEBUG(D_WARNING, "Listener %ld started OK\n", pid);
-        return rc;
-}
-
-void
-kibnal_stop_ip_listener(int clear_acceptq)
-{
-        struct list_head  zombie_accepts;
-        kib_acceptsock_t *as;
-        unsigned long     flags;
-
-        CDEBUG(D_WARNING, "Stopping listener\n");
-
-        /* Called holding kib_nid_mutex: listener running */
-        LASSERT (kibnal_data.kib_listener_sock != NULL);
-
-        kibnal_data.kib_listener_shutdown = 1;
-        wake_up_all(kibnal_data.kib_listener_sock->sk->sk_sleep);
-
-        /* Block until listener has torn down. */
-        down(&kibnal_data.kib_listener_signal);
-
-        LASSERT (kibnal_data.kib_listener_sock == NULL);
-        CDEBUG(D_WARNING, "Listener stopped\n");
-
-        if (!clear_acceptq)
-                return;
-
-        /* Close any unhandled accepts */
         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
-
-        list_add(&zombie_accepts, &kibnal_data.kib_connd_acceptq);
-        list_del_init(&kibnal_data.kib_connd_acceptq);
+                
+        list_add_tail(&as->ibas_list, &kibnal_data.kib_connd_acceptq);
+        wake_up(&kibnal_data.kib_connd_waitq);
 
         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
-        
-        while (!list_empty(&zombie_accepts)) {
-                as = list_entry(zombie_accepts.next,
-                                kib_acceptsock_t, ibas_list);
-                list_del(&as->ibas_list);
-                kibnal_free_acceptsock(as);
-        }
-}
-
-int 
-kibnal_listener_procint(ctl_table *table, int write, struct file *filp,
-                        void *buffer, size_t *lenp)
-{
-        int   *tunable = (int *)table->data;
-        int    old_val;
-        int    rc;
-
-        /* No race with nal initialisation since the nal is setup all the time
-         * it's loaded.  When that changes, change this! */
-        LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL);
-
-        down(&kibnal_data.kib_nid_mutex);
-
-        LASSERT (tunable == &kibnal_tunables.kib_port ||
-                 tunable == &kibnal_tunables.kib_backlog);
-        old_val = *tunable;
-
-        rc = proc_dointvec(table, write, filp, buffer, lenp);
-
-        if (write &&
-            (*tunable != old_val ||
-             kibnal_data.kib_listener_sock == NULL)) {
-
-                if (kibnal_data.kib_listener_sock != NULL)
-                        kibnal_stop_ip_listener(0);
-
-                rc = kibnal_start_ip_listener();
-
-                if (rc != 0) {
-                        CERROR("Unable to restart listener with new tunable:"
-                               " reverting to old value\n");
-                        *tunable = old_val;
-                        kibnal_start_ip_listener();
-                }
-        }
-
-        up(&kibnal_data.kib_nid_mutex);
-
-        LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL);
-        return rc;
+        return 0;
 }
 
 int
@@ -912,78 +583,19 @@ kibnal_stop_ib_listener (void)
 }
 
 int
-kibnal_set_mynid (ptl_nid_t nid)
+kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
 {
-        lib_ni_t         *ni = &kibnal_lib.libnal_ni;
-        int               rc;
-
-        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
-               nid, ni->ni_pid.nid);
-
-        down (&kibnal_data.kib_nid_mutex);
-
-        if (nid == kibnal_data.kib_nid) {
-                /* no change of NID */
-                up (&kibnal_data.kib_nid_mutex);
-                return (0);
-        }
-
-        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
-               kibnal_data.kib_nid, nid);
+        kib_peer_t     *peer;
+        unsigned long   flags;
+        int             rc;
 
-        if (kibnal_data.kib_listener_sock != NULL)
-                kibnal_stop_ip_listener(1);
-        
-        if (kibnal_data.kib_listen_handle != NULL)
-                kibnal_stop_ib_listener();
+        LASSERT (nid != LNET_NID_ANY);
 
-        ni->ni_pid.nid = nid;
-        kibnal_data.kib_incarnation++;
-        mb();
-        /* Delete all existing peers and their connections after new
-         * NID/incarnation set to ensure no old connections in our brave new
-         * world. */
-        kibnal_del_peer (PTL_NID_ANY, 0);
-
-        if (ni->ni_pid.nid != PTL_NID_ANY) {
-                /* got a new NID to install */
-                rc = kibnal_start_ib_listener();
-                if (rc != 0) {
-                        CERROR("Can't start IB listener: %d\n", rc);
-                        goto failed_0;
-                }
-        
-                rc = kibnal_start_ip_listener();
-                if (rc != 0) {
-                        CERROR("Can't start IP listener: %d\n", rc);
-                        goto failed_1;
-                }
+        LIBCFS_ALLOC(peer, sizeof (*peer));
+        if (peer == NULL) {
+                CERROR("Cannot allocate peer\n");
+                return -ENOMEM;
         }
-        
-        up(&kibnal_data.kib_nid_mutex);
-        return 0;
-
- failed_1:
-        kibnal_stop_ib_listener();
- failed_0:
-        ni->ni_pid.nid = PTL_NID_ANY;
-        kibnal_data.kib_incarnation++;
-        mb();
-        kibnal_del_peer (PTL_NID_ANY, 0);
-        up(&kibnal_data.kib_nid_mutex);
-        return rc;
-}
-
-kib_peer_t *
-kibnal_create_peer (ptl_nid_t nid)
-{
-        kib_peer_t *peer;
-
-        LASSERT (nid != PTL_NID_ANY);
-
-        PORTAL_ALLOC (peer, sizeof (*peer));
-        if (peer == NULL)
-                return (NULL);
 
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
 
@@ -993,53 +605,65 @@ kibnal_create_peer (ptl_nid_t nid)
         INIT_LIST_HEAD (&peer->ibp_list);       /* not in the peer table yet */
         INIT_LIST_HEAD (&peer->ibp_conns);
         INIT_LIST_HEAD (&peer->ibp_tx_queue);
+        INIT_LIST_HEAD (&peer->ibp_connd_list); /* not queued for connecting */
 
-        peer->ibp_reconnect_time = jiffies;
-        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+        peer->ibp_error = 0;
+        peer->ibp_last_alive = cfs_time_current();
+        peer->ibp_reconnect_interval = 0;       /* OK to connect at any time */
 
-        atomic_inc (&kibnal_data.kib_npeers);
-        CDEBUG(D_NET, "peer %p "LPX64"\n", peer, nid);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
-        return (peer);
+        if (atomic_read(&kibnal_data.kib_npeers) >=
+            *kibnal_tunables.kib_concurrent_peers) {
+                rc = -EOVERFLOW;        /* !! but at least it distinguishes */
+        } else if (kibnal_data.kib_nonewpeers) {
+                rc = -ESHUTDOWN;        /* shutdown has started */
+        } else {
+                rc = 0;
+                /* npeers only grows with kib_global_lock held */
+                atomic_inc(&kibnal_data.kib_npeers);
+        }
+        
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+        if (rc != 0) {
+                CERROR("Can't create peer: %s\n", 
+                       (rc == -ESHUTDOWN) ? "shutting down" : 
+                       "too many peers");
+                LIBCFS_FREE(peer, sizeof(*peer));
+        } else {
+                *peerp = peer;
+        }
+        
+        return rc;
 }
 
 void
 kibnal_destroy_peer (kib_peer_t *peer)
 {
-        CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
+        CDEBUG (D_NET, "peer %s %p deleted\n", 
+                libcfs_nid2str(peer->ibp_nid), peer);
 
         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
         LASSERT (peer->ibp_persistence == 0);
         LASSERT (!kibnal_peer_active(peer));
         LASSERT (peer->ibp_connecting == 0);
+        LASSERT (peer->ibp_accepting == 0);
+        LASSERT (list_empty (&peer->ibp_connd_list));
         LASSERT (list_empty (&peer->ibp_conns));
         LASSERT (list_empty (&peer->ibp_tx_queue));
 
-        PORTAL_FREE (peer, sizeof (*peer));
+        LIBCFS_FREE (peer, sizeof (*peer));
 
         /* NB a peer's connections keep a reference on their peer until
          * they are destroyed, so we can be assured that _all_ state to do
          * with this peer has been cleaned up when its refcount drops to
          * zero. */
-        atomic_dec (&kibnal_data.kib_npeers);
-}
-
-void
-kibnal_put_peer (kib_peer_t *peer)
-{
-        CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
-                peer, peer->ibp_nid,
-                atomic_read (&peer->ibp_refcount));
-
-        LASSERT (atomic_read (&peer->ibp_refcount) > 0);
-        if (!atomic_dec_and_test (&peer->ibp_refcount))
-                return;
-
-        kibnal_destroy_peer (peer);
+        atomic_dec(&kibnal_data.kib_npeers);
 }
 
 kib_peer_t *
-kibnal_find_peer_locked (ptl_nid_t nid)
+kibnal_find_peer_locked (lnet_nid_t nid)
 {
         struct list_head *peer_list = kibnal_nid2peerlist (nid);
         struct list_head *tmp;
@@ -1051,28 +675,28 @@ kibnal_find_peer_locked (ptl_nid_t nid)
 
                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
                          peer->ibp_connecting != 0 || /* creating conns */
+                         peer->ibp_accepting != 0 ||
                          !list_empty (&peer->ibp_conns));  /* active conn */
 
                 if (peer->ibp_nid != nid)
                         continue;
 
-                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
-                       peer, nid, atomic_read (&peer->ibp_refcount));
                 return (peer);
         }
         return (NULL);
 }
 
 kib_peer_t *
-kibnal_get_peer (ptl_nid_t nid)
+kibnal_get_peer (lnet_nid_t nid)
 {
         kib_peer_t     *peer;
+        unsigned long   flags;
 
-        read_lock (&kibnal_data.kib_global_lock);
+        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
         peer = kibnal_find_peer_locked (nid);
         if (peer != NULL)                       /* +1 ref for caller? */
-                atomic_inc (&peer->ibp_refcount);
-        read_unlock (&kibnal_data.kib_global_lock);
+                kibnal_peer_addref(peer);
+        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
         return (peer);
 }
@@ -1086,18 +710,19 @@ kibnal_unlink_peer_locked (kib_peer_t *peer)
         LASSERT (kibnal_peer_active(peer));
         list_del_init (&peer->ibp_list);
         /* lose peerlist's ref */
-        kibnal_put_peer (peer);
+        kibnal_peer_decref(peer);
 }
 
 int
-kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp,
+kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *portp,
                       int *persistencep)
 {
         kib_peer_t        *peer;
         struct list_head  *ptmp;
+        unsigned long      flags;
         int                i;
 
-        read_lock (&kibnal_data.kib_global_lock);
+        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
 
@@ -1106,6 +731,7 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp,
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
                         if (index-- > 0)
@@ -1116,34 +742,40 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp,
                         *portp = peer->ibp_port;
                         *persistencep = peer->ibp_persistence;
                         
-                        read_unlock (&kibnal_data.kib_global_lock);
+                        read_unlock_irqrestore(&kibnal_data.kib_global_lock,
+                                               flags);
                         return (0);
                 }
         }
 
-        read_unlock (&kibnal_data.kib_global_lock);
+        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
         return (-ENOENT);
 }
 
 int
-kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port)
+kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port)
 {
         unsigned long      flags;
         kib_peer_t        *peer;
         kib_peer_t        *peer2;
+        int                rc;
         
-        if (nid == PTL_NID_ANY)
+        if (nid == LNET_NID_ANY)
                 return (-EINVAL);
 
-        peer = kibnal_create_peer (nid);
-        if (peer == NULL)
-                return (-ENOMEM);
+        rc = kibnal_create_peer (&peer, nid);
+        if (rc != 0)
+                return rc;
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
+        /* I'm always called with a reference on kibnal_data.kib_ni
+         * so shutdown can't have started */
+        LASSERT (kibnal_data.kib_nonewpeers == 0);
+
         peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
-                kibnal_put_peer (peer);
+                kibnal_peer_decref(peer);
                 peer = peer2;
         } else {
                 /* peer table takes existing ref on peer */
@@ -1160,19 +792,13 @@ kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port)
 }
 
 void
-kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+kibnal_del_peer_locked (kib_peer_t *peer)
 {
         struct list_head *ctmp;
         struct list_head *cnxt;
         kib_conn_t       *conn;
 
-        if (!single_share)
-                peer->ibp_persistence = 0;
-        else if (peer->ibp_persistence > 0)
-                peer->ibp_persistence--;
-
-        if (peer->ibp_persistence != 0)
-                return;
+        peer->ibp_persistence = 0;
 
         if (list_empty(&peer->ibp_conns)) {
                 kibnal_unlink_peer_locked(peer);
@@ -1190,9 +816,10 @@ kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
 }
 
 int
-kibnal_del_peer (ptl_nid_t nid, int single_share)
+kibnal_del_peer (lnet_nid_t nid)
 {
         unsigned long      flags;
+        CFS_LIST_HEAD     (zombies);
         struct list_head  *ptmp;
         struct list_head  *pnxt;
         kib_peer_t        *peer;
@@ -1203,7 +830,7 @@ kibnal_del_peer (ptl_nid_t nid, int single_share)
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        if (nid != PTL_NID_ANY)
+        if (nid != LNET_NID_ANY)
                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
@@ -1215,21 +842,27 @@ kibnal_del_peer (ptl_nid_t nid, int single_share)
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
-                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+                        if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
                                 continue;
 
-                        kibnal_del_peer_locked (peer, single_share);
-                        rc = 0;         /* matched something */
+                        if (!list_empty(&peer->ibp_tx_queue)) {
+                                LASSERT (list_empty(&peer->ibp_conns));
+
+                                list_splice_init(&peer->ibp_tx_queue, &zombies);
+                        }
 
-                        if (single_share)
-                                goto out;
+                        kibnal_del_peer_locked (peer);
+                        rc = 0;         /* matched something */
                 }
         }
- out:
+
         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
+        kibnal_txlist_done(&zombies, -EIO);
+
         return (rc);
 }
 
@@ -1240,9 +873,10 @@ kibnal_get_conn_by_idx (int index)
         struct list_head  *ptmp;
         kib_conn_t        *conn;
         struct list_head  *ctmp;
+        unsigned long      flags;
         int                i;
 
-        read_lock (&kibnal_data.kib_global_lock);
+        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
         for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
@@ -1250,6 +884,7 @@ kibnal_get_conn_by_idx (int index)
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence > 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
                         list_for_each (ctmp, &peer->ibp_conns) {
@@ -1257,17 +892,15 @@ kibnal_get_conn_by_idx (int index)
                                         continue;
 
                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
-                                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                                       atomic_read (&conn->ibc_refcount));
-                                atomic_inc (&conn->ibc_refcount);
-                                read_unlock (&kibnal_data.kib_global_lock);
+                                kibnal_conn_addref(conn);
+                                read_unlock_irqrestore(&kibnal_data.kib_global_lock,
+                                                       flags);
                                 return (conn);
                         }
                 }
         }
 
-        read_unlock (&kibnal_data.kib_global_lock);
+        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
         return (NULL);
 }
 
@@ -1286,7 +919,7 @@ kibnal_create_conn (void)
                 struct ib_qp_attribute     qp_attr;
         } params;
         
-        PORTAL_ALLOC (conn, sizeof (*conn));
+        LIBCFS_ALLOC (conn, sizeof (*conn));
         if (conn == NULL) {
                 CERROR ("Can't allocate connection\n");
                 return (NULL);
@@ -1295,14 +928,16 @@ kibnal_create_conn (void)
         /* zero flags, NULL pointers etc... */
         memset (conn, 0, sizeof (*conn));
 
+        INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
         INIT_LIST_HEAD (&conn->ibc_tx_queue);
+        INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
         INIT_LIST_HEAD (&conn->ibc_active_txs);
         spin_lock_init (&conn->ibc_lock);
         
         atomic_inc (&kibnal_data.kib_nconns);
         /* well not really, but I call destroy() on failure, which decrements */
 
-        PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+        LIBCFS_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
         if (conn->ibc_rxs == NULL)
                 goto failed;
         memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
@@ -1336,11 +971,13 @@ kibnal_create_conn (void)
                 }
         }
 
+        /* We can post up to IBNAL_RX_MSGS, which may also include an
+         * additional RDMA work item */
+
         params.qp_create = (struct ib_qp_create_param) {
                 .limit = {
-                        /* Sends have an optional RDMA */
-                        .max_outstanding_send_request    = 2 * IBNAL_MSG_QUEUE_SIZE,
-                        .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
+                        .max_outstanding_send_request    = 2 * IBNAL_RX_MSGS,
+                        .max_outstanding_receive_request = IBNAL_RX_MSGS,
                         .max_send_gather_element         = 1,
                         .max_receive_scatter_element     = 1,
                 },
@@ -1397,6 +1034,8 @@ kibnal_destroy_conn (kib_conn_t *conn)
 
         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
         LASSERT (list_empty(&conn->ibc_tx_queue));
+        LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+        LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
         LASSERT (list_empty(&conn->ibc_active_txs));
         LASSERT (conn->ibc_nsends_posted == 0);
         LASSERT (conn->ibc_connreq == NULL);
@@ -1422,13 +1061,13 @@ kibnal_destroy_conn (kib_conn_t *conn)
                 kibnal_free_pages(conn->ibc_rx_pages);
         
         if (conn->ibc_rxs != NULL)
-                PORTAL_FREE(conn->ibc_rxs, 
+                LIBCFS_FREE(conn->ibc_rxs, 
                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
         if (conn->ibc_peer != NULL)
-                kibnal_put_peer(conn->ibc_peer);
+                kibnal_peer_decref(conn->ibc_peer);
 
-        PORTAL_FREE(conn, sizeof (*conn));
+        LIBCFS_FREE(conn, sizeof (*conn));
 
         atomic_dec(&kibnal_data.kib_nconns);
         
@@ -1441,30 +1080,6 @@ kibnal_destroy_conn (kib_conn_t *conn)
         }
 }
 
-void
-kibnal_put_conn (kib_conn_t *conn)
-{
-        unsigned long flags;
-
-        CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
-                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                atomic_read (&conn->ibc_refcount));
-
-        LASSERT (atomic_read (&conn->ibc_refcount) > 0);
-        if (!atomic_dec_and_test (&conn->ibc_refcount))
-                return;
-
-        /* last ref only goes on zombies */
-        LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
-
-        spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags);
-
-        list_add (&conn->ibc_list, &kibnal_data.kib_reaper_conns);
-        wake_up (&kibnal_data.kib_reaper_waitq);
-
-        spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags);
-}
-
 int
 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
 {
@@ -1497,8 +1112,10 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
                 if (conn->ibc_incarnation == incarnation)
                         continue;
 
-                CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
-                       peer->ibp_nid, conn->ibc_incarnation, incarnation);
+                CDEBUG(D_NET, "Closing stale conn %p nid: %s"
+                       " incarnation:"LPX64"("LPX64")\n", conn,
+                       libcfs_nid2str(peer->ibp_nid), 
+                       conn->ibc_incarnation, incarnation);
                 
                 count++;
                 kibnal_close_conn_locked (conn, -ESTALE);
@@ -1508,7 +1125,7 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
 }
 
 int
-kibnal_close_matching_conns (ptl_nid_t nid)
+kibnal_close_matching_conns (lnet_nid_t nid)
 {
         unsigned long       flags;
         kib_peer_t         *peer;
@@ -1521,7 +1138,7 @@ kibnal_close_matching_conns (ptl_nid_t nid)
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        if (nid != PTL_NID_ANY)
+        if (nid != LNET_NID_ANY)
                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
@@ -1534,9 +1151,10 @@ kibnal_close_matching_conns (ptl_nid_t nid)
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
-                        if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+                        if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
                                 continue;
 
                         count += kibnal_close_peer_conns_locked (peer, 0);
@@ -1546,72 +1164,71 @@ kibnal_close_matching_conns (ptl_nid_t nid)
         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         /* wildcards always succeed */
-        if (nid == PTL_NID_ANY)
+        if (nid == LNET_NID_ANY)
                 return (0);
         
         return (count == 0 ? -ENOENT : 0);
 }
 
 int
-kibnal_cmd(struct portals_cfg *pcfg, void * private)
+kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 {
-        int rc = -EINVAL;
+        struct libcfs_ioctl_data *data = arg;
+        int                       rc = -EINVAL;
 
-        LASSERT (pcfg != NULL);
+        LASSERT (ni == kibnal_data.kib_ni);
 
-        switch(pcfg->pcfg_command) {
-        case NAL_CMD_GET_PEER: {
-                ptl_nid_t   nid = 0;
+        switch(cmd) {
+        case IOC_LIBCFS_GET_PEER: {
+                lnet_nid_t   nid = 0;
                 __u32       ip = 0;
                 int         port = 0;
                 int         share_count = 0;
 
-                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                rc = kibnal_get_peer_info(data->ioc_count,
                                           &nid, &ip, &port, &share_count);
-                pcfg->pcfg_nid   = nid;
-                pcfg->pcfg_size  = 0;
-                pcfg->pcfg_id    = ip;
-                pcfg->pcfg_misc  = port;
-                pcfg->pcfg_count = 0;
-                pcfg->pcfg_wait  = share_count;
+                data->ioc_nid    = nid;
+                data->ioc_count  = share_count;
+                data->ioc_u32[0] = ip;
+                data->ioc_u32[1] = port;
                 break;
         }
-        case NAL_CMD_ADD_PEER: {
-                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
-                                                 pcfg->pcfg_id, /* IP */
-                                                 pcfg->pcfg_misc); /* port */
+        case IOC_LIBCFS_ADD_PEER: {
+                rc = kibnal_add_persistent_peer (data->ioc_nid,
+                                                 data->ioc_u32[0], /* IP */
+                                                 data->ioc_u32[1]); /* port */
                 break;
         }
-        case NAL_CMD_DEL_PEER: {
-                rc = kibnal_del_peer (pcfg->pcfg_nid, 
-                                       /* flags == single_share */
-                                       pcfg->pcfg_flags != 0);
+        case IOC_LIBCFS_DEL_PEER: {
+                rc = kibnal_del_peer (data->ioc_nid);
                 break;
         }
-        case NAL_CMD_GET_CONN: {
-                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+        case IOC_LIBCFS_GET_CONN: {
+                kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
 
                 if (conn == NULL)
                         rc = -ENOENT;
                 else {
                         rc = 0;
-                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
-                        pcfg->pcfg_id    = 0;
-                        pcfg->pcfg_misc  = 0;
-                        pcfg->pcfg_flags = 0;
-                        kibnal_put_conn (conn);
+                        data->ioc_nid = conn->ibc_peer->ibp_nid;
+                        kibnal_conn_decref(conn);
                 }
                 break;
         }
-        case NAL_CMD_CLOSE_CONNECTION: {
-                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+        case IOC_LIBCFS_CLOSE_CONNECTION: {
+                rc = kibnal_close_matching_conns (data->ioc_nid);
                 break;
         }
-        case NAL_CMD_REGISTER_MYNID: {
-                if (pcfg->pcfg_nid == PTL_NID_ANY)
+        case IOC_LIBCFS_REGISTER_MYNID: {
+                /* Ignore if this is a noop */
+                if (data->ioc_nid == ni->ni_nid) {
+                        rc = 0;
+                } else {
+                        CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                               libcfs_nid2str(data->ioc_nid),
+                               libcfs_nid2str(ni->ni_nid));
                         rc = -EINVAL;
-                else
-                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
+                }
                 break;
         }
         }
@@ -1636,7 +1253,7 @@ kibnal_free_pages (kib_pages_t *p)
                 if (p->ibp_pages[i] != NULL)
                         __free_page(p->ibp_pages[i]);
         
-        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+        LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
 }
 
 int
@@ -1647,7 +1264,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
         int                         i;
         int                         rc;
 
-        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+        LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
                 CERROR ("Can't allocate buffer %d\n", npages);
                 return (-ENOMEM);
@@ -1665,7 +1282,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
                 }
         }
 
-        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
+        LIBCFS_ALLOC(phys_pages, npages * sizeof(*phys_pages));
         if (phys_pages == NULL) {
                 CERROR ("Can't allocate physarray for %d pages\n", npages);
                 kibnal_free_pages(p);
@@ -1675,7 +1292,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
         for (i = 0; i < npages; i++) {
                 phys_pages[i].size = PAGE_SIZE;
                 phys_pages[i].address =
-                        kibnal_page2phys(p->ibp_pages[i]);
+                        lnet_page2phys(p->ibp_pages[i]);
         }
 
         p->ibp_vaddr = 0;
@@ -1688,7 +1305,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
                                          &p->ibp_lkey,
                                          &p->ibp_rkey);
         
-        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
+        LIBCFS_FREE(phys_pages, npages * sizeof(*phys_pages));
         
         if (rc != 0) {
                 CERROR ("Error %d mapping %d pages\n", rc, npages);
@@ -1720,14 +1337,14 @@ kibnal_setup_tx_descs (void)
         LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
 
         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
-                                IBNAL_TX_MSG_PAGES, 
+                                IBNAL_TX_MSG_PAGES(),
                                 0);            /* local read access only */
         if (rc != 0)
                 return (rc);
 
         vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
 
-        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                 tx = &kibnal_data.kib_tx_descs[i];
 
@@ -1735,21 +1352,15 @@ kibnal_setup_tx_descs (void)
                 
                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                 tx->tx_vaddr = vaddr;
-                tx->tx_isnblk = (i >= IBNAL_NTX);
                 tx->tx_mapped = KIB_TX_UNMAPPED;
 
                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
                        i, tx, tx->tx_msg, tx->tx_vaddr);
 
-                if (tx->tx_isnblk)
-                        list_add (&tx->tx_list, 
-                                  &kibnal_data.kib_idle_nblk_txs);
-                else
-                        list_add (&tx->tx_list, 
-                                  &kibnal_data.kib_idle_txs);
+                list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
 
                 vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES());
 
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
@@ -1757,7 +1368,7 @@ kibnal_setup_tx_descs (void)
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
                 }
         }
         
@@ -1765,21 +1376,17 @@ kibnal_setup_tx_descs (void)
 }
 
 void
-kibnal_api_shutdown (nal_t *nal)
+kibnal_shutdown (lnet_ni_t *ni)
 {
-        int   i;
-        int   rc;
-
-        if (nal->nal_refct != 0) {
-                /* This module got the first ref */
-                PORTAL_MODULE_UNUSE;
-                return;
-        }
+        int           i;
+        int           rc;
+        unsigned long flags;
 
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
-               atomic_read (&portal_kmemory));
+               atomic_read (&libcfs_kmemory));
 
-        LASSERT(nal == &kibnal_api);
+        LASSERT(ni == kibnal_data.kib_ni);
+        LASSERT(ni->ni_data == &kibnal_data);
 
         switch (kibnal_data.kib_init) {
         default:
@@ -1787,23 +1394,39 @@ kibnal_api_shutdown (nal_t *nal)
                 LBUG();
 
         case IBNAL_INIT_ALL:
-                /* stop calls to nal_cmd */
-                libcfs_nal_cmd_unregister(OPENIBNAL);
-                /* No new peers */
+                /* Prevent new peers from being created */
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+                kibnal_data.kib_nonewpeers = 1;
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
-                /* resetting my NID unadvertises me, removes my
-                 * listener and nukes all current peers */
-                kibnal_set_mynid (PTL_NID_ANY);
+                kibnal_stop_ib_listener();
+
+                /* Remove all existing peers from the peer table */
+                kibnal_del_peer(LNET_NID_ANY);
+                
+                /* Wait for pending conn reqs to be handled */
+                i = 2;
+                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+                while (!list_empty(&kibnal_data.kib_connd_acceptq)) {
+                        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, 
+                                               flags);
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
+                               "waiting for conn reqs to clean up\n");
+                        cfs_pause(cfs_time_seconds(1));
+                        
+                        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+                }
+                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
 
                 /* Wait for all peer state to clean up */
                 i = 2;
-                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+                while (atomic_read(&kibnal_data.kib_npeers) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "waiting for %d peers to close down\n",
-                               atomic_read (&kibnal_data.kib_npeers));
-                        set_current_state (TASK_INTERRUPTIBLE);
-                        schedule_timeout (HZ);
+                               atomic_read(&kibnal_data.kib_npeers));
+                        cfs_pause(cfs_time_seconds(1));
                 }
                 /* fall through */
 
@@ -1829,14 +1452,10 @@ kibnal_api_shutdown (nal_t *nal)
                         CERROR ("Destroy PD error: %d\n", rc);
                 /* fall through */
 
-        case IBNAL_INIT_LIB:
-                lib_fini(&kibnal_lib);
-                /* fall through */
-
         case IBNAL_INIT_DATA:
                 /* Module refcount only gets to zero when all peers
                  * have been closed so all lists must be empty */
-                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0);
                 LASSERT (kibnal_data.kib_peers != NULL);
                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
@@ -1860,8 +1479,7 @@ kibnal_api_shutdown (nal_t *nal)
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "Waiting for %d threads to terminate\n",
                                atomic_read (&kibnal_data.kib_nthreads));
-                        set_current_state (TASK_INTERRUPTIBLE);
-                        schedule_timeout (HZ);
+                        cfs_pause(cfs_time_seconds(1));
                 }
                 /* fall through */
                 
@@ -1870,57 +1488,135 @@ kibnal_api_shutdown (nal_t *nal)
         }
 
         if (kibnal_data.kib_tx_descs != NULL)
-                PORTAL_FREE (kibnal_data.kib_tx_descs,
-                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
+                LIBCFS_FREE (kibnal_data.kib_tx_descs,
+                             IBNAL_TX_MSGS() * sizeof(kib_tx_t));
 
         if (kibnal_data.kib_peers != NULL)
-                PORTAL_FREE (kibnal_data.kib_peers,
+                LIBCFS_FREE (kibnal_data.kib_peers,
                              sizeof (struct list_head) * 
                              kibnal_data.kib_peer_hash_size);
 
         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
-               atomic_read (&portal_kmemory));
-        printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
-               atomic_read(&portal_kmemory));
+               atomic_read (&libcfs_kmemory));
 
         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+        PORTAL_MODULE_UNUSE;
 }
 
 int
-kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
-                     ptl_ni_limits_t *requested_limits,
-                     ptl_ni_limits_t *actual_limits)
+kibnal_get_ipoibidx(void)
 {
+        /* NB single threaded! */
+        static struct ib_port_properties port_props;
+
+        int               ipoibidx = 0;
+        int               devidx;
+        int               port;
+        int               rc;
+        struct ib_device *device;
+
+        for (devidx = 0; devidx <= kibnal_data.kib_hca_idx; devidx++) {
+                device = ib_device_get_by_index(devidx);
+                
+                if (device == NULL) {
+                        CERROR("Can't get IB device %d\n", devidx);
+                        return -1;
+                }
+                
+                for (port = 1; port <= 2; port++) {
+                        if (devidx == kibnal_data.kib_hca_idx &&
+                            port == kibnal_data.kib_port)
+                                return ipoibidx;
+                        
+                        rc = ib_port_properties_get(device, port,
+                                                    &port_props);
+                        if (rc == 0)
+                                ipoibidx++;
+                }
+        }
+
+        LBUG();
+        return -1;
+}
+
+int
+kibnal_startup (lnet_ni_t *ni)
+{
+        char              ipif_name[32];
+        __u32             ip;
+        __u32             netmask;
+        int               up;
         struct timeval    tv;
-        ptl_process_id_t  process_id;
-        int               pkmem = atomic_read(&portal_kmemory);
         int               rc;
+        int               hca;
+        int               port;
         int               i;
+        int               nob;
 
-        LASSERT (nal == &kibnal_api);
+        LASSERT (ni->ni_lnd == &the_kiblnd);
 
-        if (nal->nal_refct != 0) {
-                if (actual_limits != NULL)
-                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
-                /* This module got the first ref */
-                PORTAL_MODULE_USE;
-                return (PTL_OK);
+        /* Only 1 instance supported */
+        if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
+                CERROR ("Only 1 instance supported\n");
+                return -EPERM;
         }
 
-        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+        if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
+                CERROR ("Can't set credits(%d) > ntx(%d)\n",
+                        *kibnal_tunables.kib_credits,
+                        *kibnal_tunables.kib_ntx);
+                return -EINVAL;
+        }
 
         memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
 
+        ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
+        ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
+
+        CLASSERT (LNET_MAX_INTERFACES > 1);
+
+
+        kibnal_data.kib_hca_idx = 0;            /* default: first HCA */
+        kibnal_data.kib_port = 0;               /* any port */
+
+        if (ni->ni_interfaces[0] != NULL) {
+                /* hca.port specified in 'networks=openib(h.p)' */
+                if (ni->ni_interfaces[1] != NULL) {
+                        CERROR("Multiple interfaces not supported\n");
+                        return -EPERM;
+                }
+                
+                nob = strlen(ni->ni_interfaces[0]);
+                i = sscanf(ni->ni_interfaces[0], "%d.%d%n", &hca, &port, &nob);
+                if (i >= 2 && nob == strlen(ni->ni_interfaces[0])) {
+                        kibnal_data.kib_hca_idx = hca;
+                        kibnal_data.kib_port = port;
+                } else {
+                        nob = strlen(ni->ni_interfaces[0]);
+                        i = sscanf(ni->ni_interfaces[0], "%d%n", &hca, &nob);
+
+                        if (i >= 1 && nob == strlen(ni->ni_interfaces[0])) {
+                                kibnal_data.kib_hca_idx = hca;
+                        } else {
+                                CERROR("Can't parse interface '%s'\n",
+                                       ni->ni_interfaces[0]);
+                                return -EINVAL;
+                        }
+                }
+        }
+        
+        kibnal_data.kib_ni = ni;
+        ni->ni_data = &kibnal_data;
+        
         do_gettimeofday(&tv);
         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
 
-        init_MUTEX (&kibnal_data.kib_nid_mutex);
-        init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
+        PORTAL_MODULE_USE;
 
         rwlock_init(&kibnal_data.kib_global_lock);
 
         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
-        PORTAL_ALLOC (kibnal_data.kib_peers,
+        LIBCFS_ALLOC (kibnal_data.kib_peers,
                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
         if (kibnal_data.kib_peers == NULL) {
                 goto failed;
@@ -1944,11 +1640,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
         spin_lock_init (&kibnal_data.kib_tx_lock);
         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
-        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
-        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
 
-        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
-                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS() * sizeof(kib_tx_t));
         if (kibnal_data.kib_tx_descs == NULL) {
                 CERROR ("Can't allocate tx descs\n");
                 goto failed;
@@ -1958,21 +1652,6 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_DATA;
         /*****************************************************/
 
-
-        process_id.pid = requested_pid;
-        process_id.nid = PTL_NID_ANY;           /* don't know my NID yet */
-        
-        rc = lib_init(&kibnal_lib, nal, process_id,
-                      requested_limits, actual_limits);
-        if (rc != PTL_OK) {
-                CERROR("lib_init failed: error %d\n", rc);
-                goto failed;
-        }
-
-        /* lib interface initialised */
-        kibnal_data.kib_init = IBNAL_INIT_LIB;
-        /*****************************************************/
-
         for (i = 0; i < IBNAL_N_SCHED; i++) {
                 rc = kibnal_thread_start (kibnal_scheduler,
                                           (void *)((unsigned long)i));
@@ -1983,7 +1662,13 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 }
         }
 
-        for (i = 0; i < IBNAL_N_CONND; i++) {
+        /* must have at least 2 connds to remain responsive to svcqry while
+         * connecting */
+        if (*kibnal_tunables.kib_n_connd < 2)
+                *kibnal_tunables.kib_n_connd = 2;
+
+
+        for (i = 0; i < *kibnal_tunables.kib_n_connd; i++) {
                 rc = kibnal_thread_start (kibnal_connd,
                                           (void *)((unsigned long)i));
                 if (rc != 0) {
@@ -1999,9 +1684,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 goto failed;
         }
 
-        kibnal_data.kib_device = ib_device_get_by_index(0);
+        kibnal_data.kib_device = ib_device_get_by_index(kibnal_data.kib_hca_idx);
         if (kibnal_data.kib_device == NULL) {
-                CERROR ("Can't open ib device 0\n");
+                CERROR ("Can't open ib device %d\n",
+                        kibnal_data.kib_hca_idx);
                 goto failed;
         }
         
@@ -2016,19 +1702,54 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                kibnal_data.kib_device_props.max_initiator_per_qp,
                kibnal_data.kib_device_props.max_responder_per_qp);
 
-        kibnal_data.kib_port = 0;
-        for (i = 1; i <= 2; i++) {
-                rc = ib_port_properties_get(kibnal_data.kib_device, i,
+        if (kibnal_data.kib_port != 0) {
+                rc = ib_port_properties_get(kibnal_data.kib_device, 
+                                            kibnal_data.kib_port,
                                             &kibnal_data.kib_port_props);
-                if (rc == 0) {
-                        kibnal_data.kib_port = i;
-                        break;
+                if (rc != 0) {
+                        CERROR("Error %d open port %d on HCA %d\n", rc,
+                               kibnal_data.kib_port,
+                               kibnal_data.kib_hca_idx);
+                        goto failed;
+                }
+        } else {
+                for (i = 1; i <= 2; i++) {
+                        rc = ib_port_properties_get(kibnal_data.kib_device, i,
+                                                    &kibnal_data.kib_port_props);
+                        if (rc == 0) {
+                                kibnal_data.kib_port = i;
+                                break;
+                        }
                 }
+                if (kibnal_data.kib_port == 0) {
+                        CERROR ("Can't find a port\n");
+                        goto failed;
+                }
+        }
+
+        i = kibnal_get_ipoibidx();
+        if (i < 0)
+                goto failed;
+        
+        snprintf(ipif_name, sizeof(ipif_name), "%s%d",
+                 *kibnal_tunables.kib_ipif_basename, i);
+        if (strlen(ipif_name) == sizeof(ipif_name) - 1) {
+                CERROR("IPoIB interface name %s truncated\n", ipif_name);
+                return -EINVAL;
         }
-        if (kibnal_data.kib_port == 0) {
-                CERROR ("Can't find a port\n");
+        
+        rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
+        if (rc != 0) {
+                CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
+                goto failed;
+        }
+        
+        if (!up) {
+                CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
                 goto failed;
         }
+        
+        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
 
         rc = ib_pd_create(kibnal_data.kib_device,
                           NULL, &kibnal_data.kib_pd);
@@ -2042,9 +1763,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         /*****************************************************/
 #if IBNAL_FMR
         {
-                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
+                const int pool_size = *kibnal_tunables.kib_ntx;
                 struct ib_fmr_pool_param params = {
-                        .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
+                        .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
                         .access            = (IB_ACCESS_LOCAL_WRITE |
                                               IB_ACCESS_REMOTE_WRITE |
                                               IB_ACCESS_REMOTE_READ),
@@ -2087,7 +1808,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                         },
                         .arg            = NULL,
                 };
-                int  nentries = IBNAL_CQ_ENTRIES;
+                int  nentries = IBNAL_CQ_ENTRIES();
                 
                 rc = ib_cq_create (kibnal_data.kib_device, 
                                    &nentries, &callback, NULL,
@@ -2101,39 +1822,31 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
                 LASSERT (rc == 0);
         }
-        
+
         /* flag CQ initialised */
         kibnal_data.kib_init = IBNAL_INIT_CQ;
         /*****************************************************/
-        
-        rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
-        if (rc != 0) {
-                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
-                goto failed;
-        }
 
+        rc = kibnal_start_ib_listener();
+        if (rc != 0)
+                goto failed;
+        
         /* flag everything initialised */
         kibnal_data.kib_init = IBNAL_INIT_ALL;
         /*****************************************************/
 
-        printk(KERN_INFO "Lustre: OpenIB NAL loaded "
-               "(initial mem %d)\n", pkmem);
-
-        return (PTL_OK);
+        return 0;
 
  failed:
-        kibnal_api_shutdown (&kibnal_api);    
-        return (PTL_FAIL);
+        kibnal_shutdown(ni);    
+        return -ENETDOWN;
 }
 
 void __exit
 kibnal_module_fini (void)
 {
-        if (kibnal_tunables.kib_sysctl != NULL)
-                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
-        PtlNIFini(kibnal_ni);
-
-        ptl_unregister_nal(OPENIBNAL);
+        lnet_unregister_lnd(&the_kiblnd);
+        kibnal_tunables_fini();
 }
 
 int __init
@@ -2141,50 +1854,22 @@ kibnal_module_init (void)
 {
         int    rc;
 
-        /* the following must be sizeof(int) for proc_dointvec() */
-        LASSERT (sizeof(kibnal_tunables.kib_io_timeout) == sizeof(int));
-        LASSERT (sizeof(kibnal_tunables.kib_listener_timeout) == sizeof(int));
-        LASSERT (sizeof(kibnal_tunables.kib_backlog) == sizeof(int));
-        LASSERT (sizeof(kibnal_tunables.kib_port) == sizeof(int));
-
-        kibnal_api.nal_ni_init = kibnal_api_startup;
-        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
-
-        /* Initialise dynamic tunables to defaults once only */
-        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
-        kibnal_tunables.kib_listener_timeout = IBNAL_LISTENER_TIMEOUT;
-        kibnal_tunables.kib_backlog = IBNAL_BACKLOG;
-        kibnal_tunables.kib_port = IBNAL_PORT;
-
-        rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
-        if (rc != PTL_OK) {
-                CERROR("Can't register IBNAL: %d\n", rc);
-                return (-ENOMEM);               /* or something... */
-        }
-
-        /* Pure gateways want the NAL started up at module load time... */
-        rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
-                ptl_unregister_nal(OPENIBNAL);
-                return (-ENODEV);
-        }
+        rc = kibnal_tunables_init();
+        if (rc != 0)
+                return rc;
         
-        kibnal_tunables.kib_sysctl = 
-                register_sysctl_table (kibnal_top_ctl_table, 0);
-        if (kibnal_tunables.kib_sysctl == NULL) {
-                CERROR("Can't register sysctl table\n");
-                PtlNIFini(kibnal_ni);
-                ptl_unregister_nal(OPENIBNAL);
-                return (-ENOMEM);
-        }
+        lnet_register_lnd(&the_kiblnd);
 
         return (0);
 }
 
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+#ifdef USING_TSAPI
+MODULE_DESCRIPTION("Kernel Cisco IB LND v1.00");
+#else
+MODULE_DESCRIPTION("Kernel OpenIB(gen1) LND v1.00");
+#endif
 MODULE_LICENSE("GPL");
 
 module_init(kibnal_module_init);
 module_exit(kibnal_module_fini);
-