Whamcloud - gitweb
* Added interface and socket queries to libcfs (libcfs/linux-tcpip.c). Some
authoreeb <eeb>
Wed, 1 Jun 2005 21:23:21 +0000 (21:23 +0000)
committereeb <eeb>
Wed, 1 Jun 2005 21:23:21 +0000 (21:23 +0000)
     of this came from socknal_lib-linux.c but was generalised a little so all
     the NALs that use TCP/IP to do connection establishment can use it.

     CAVEAT EMPTOR! Just the linux versions are done; the darwin versions are
     not implemented yet.

     Changed socknal and ranal to use this.

*    Brought ranal up-to-date with new config + mod params for all tunables.
     Ranal gets its local address from the IP of a (single) specified
     interface, or the first "suitable" one found if no interface is specified.

22 files changed:
lnet/include/libcfs/libcfs.h
lnet/include/libcfs/linux/Makefile.am
lnet/include/libcfs/linux/libcfs.h
lnet/include/libcfs/linux/linux-tcpip.h [new file with mode: 0644]
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-p30.h
lnet/klnds/ralnd/Makefile.in
lnet/klnds/ralnd/ralnd.c
lnet/klnds/ralnd/ralnd.h
lnet/klnds/ralnd/ralnd_cb.c
lnet/klnds/ralnd/ralnd_modparams.c [new file with mode: 0644]
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/klnds/socklnd/socklnd_lib-linux.c
lnet/klnds/socklnd/socklnd_lib-linux.h
lnet/libcfs/Makefile.in
lnet/libcfs/linux/Makefile.am
lnet/libcfs/linux/linux-tcpip.c [new file with mode: 0644]
lnet/libcfs/nidstrings.c
lnet/lnet/config.c
lnet/utils/portals.c

index ec8d5b1..c77a495 100644 (file)
@@ -273,6 +273,25 @@ struct libcfs_ioctl_handler {
 int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand);
 int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand);
 
+/* libcfs tcpip */
+int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask);
+int libcfs_ipif_enumerate(char ***names);
+void libcfs_ipif_free_enumeration(char **names, int n);
+int libcfs_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog);
+int libcfs_sock_accept(struct socket **newsockp, struct socket *sock,
+                       int bufsize);
+void libcfs_sock_abort_accept(struct socket *sock);
+int libcfs_sock_connect(struct socket **sockp, int *fatal, int bufsize,
+                        __u32 local_ip, int local_port,
+                        __u32 peer_ip, int peer_port);
+int libcfs_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize);
+int libcfs_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize);
+int libcfs_sock_write(struct socket *sock, void *buffer, int nob, int timeout);
+int libcfs_sock_read(struct socket *sock, void *buffer, int nob, int timeout);
+void libcfs_sock_release(struct socket *sock);
+
+void libcfs_pause(cfs_duration_t ticks);
+
 /* libcfs watchdogs */
 struct lc_watchdog;
 
index 159cf57..6d1e241 100644 (file)
@@ -1,3 +1,3 @@
 EXTRA_DIST := kp30.h libcfs.h linux-fs.h linux-lock.h linux-mem.h      \
-       linux-prim.h linux-time.h lltrace.h portals_compat25.h          \
-       portals_lib.h portals_utils.h
+       linux-prim.h linux-time.h linux-tcpip.h lltrace.h               \
+       portals_compat25.h portals_lib.h portals_utils.h
index cd48871..e62ac48 100644 (file)
@@ -13,6 +13,7 @@
 #include <libcfs/linux/linux-prim.h>
 #include <libcfs/linux/linux-lock.h>
 #include <libcfs/linux/linux-fs.h>
+#include <libcfs/linux/linux-tcpip.h>
 
 #ifdef HAVE_ASM_TYPES_H
 #include <asm/types.h>
diff --git a/lnet/include/libcfs/linux/linux-tcpip.h b/lnet/include/libcfs/linux/linux-tcpip.h
new file mode 100644 (file)
index 0000000..fce2ede
--- /dev/null
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_TCP_H__
+#define __LIBCFS_LINUX_CFS_TCP_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+#ifdef __KERNEL__
+#include <net/sock.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72))
+# define sk_allocation  allocation
+# define sk_data_ready data_ready
+# define sk_write_space write_space
+# define sk_user_data   user_data
+# define sk_prot        prot
+# define sk_sndbuf      sndbuf
+# define sk_socket      socket
+# define sk_sleep       sleep
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
+# define sk_wmem_queued wmem_queued
+# define sk_err         err
+#endif
+
+#define SOCK_SNDBUF(so)         ((so)->sk->sk_sndbuf)
+#define SOCK_WMEM_QUEUED(so)    ((so)->sk->sk_wmem_queued)
+#define SOCK_ERROR(so)          ((so)->sk->sk_err)
+#define SOCK_TEST_NOSPACE(so)  test_bit(SOCK_NOSPACE, &(so)->flags)
+
+#endif
+
+#endif
index 61e6dc0..ef8d896 100644 (file)
@@ -460,6 +460,7 @@ extern void ptl_md_deconstruct(ptl_libmd_t *lmd, ptl_md_t *umd);
 #ifdef __KERNEL__
 extern void ptl_register_nal(ptl_nal_t *nal);
 extern void ptl_unregister_nal(ptl_nal_t *nal);
+extern ptl_err_t ptl_set_ip_niaddr (ptl_ni_t *ni);
 #endif
 
 extern ptl_err_t ptl_parse_routes (char *route_str);
index 61e6dc0..ef8d896 100644 (file)
@@ -460,6 +460,7 @@ extern void ptl_md_deconstruct(ptl_libmd_t *lmd, ptl_md_t *umd);
 #ifdef __KERNEL__
 extern void ptl_register_nal(ptl_nal_t *nal);
 extern void ptl_unregister_nal(ptl_nal_t *nal);
+extern ptl_err_t ptl_set_ip_niaddr (ptl_ni_t *ni);
 #endif
 
 extern ptl_err_t ptl_parse_routes (char *route_str);
index 1772cc2..439f902 100644 (file)
@@ -1,5 +1,5 @@
 MODULES := kranal
-kranal-objs := ranal.o ranal_cb.o
+kranal-objs := ranal.o ranal_cb.o ranal_modparams.o
 
 EXTRA_POST_CFLAGS := @RACPPFLAGS@
 
index e48d5e2..a091b21 100644 (file)
@@ -22,8 +22,8 @@
  */
 #include "ranal.h"
 
-static int        kranal_devids[] = {RAPK_MAIN_DEVICE_ID,
-                                     RAPK_EXPANSION_DEVICE_ID};
+static int        kranal_devids[RANAL_MAXDEVS] = {RAPK_MAIN_DEVICE_ID,
+                                                  RAPK_EXPANSION_DEVICE_ID};
 
 ptl_nal_t kranal_nal = {
         .nal_type       = RANAL,
@@ -37,190 +37,6 @@ ptl_nal_t kranal_nal = {
 };
 
 kra_data_t              kranal_data;
-kra_tunables_t          kranal_tunables;
-
-#define RANAL_SYSCTL_TIMEOUT           1
-#define RANAL_SYSCTL_LISTENER_TIMEOUT  2
-#define RANAL_SYSCTL_BACKLOG           3
-#define RANAL_SYSCTL_PORT              4
-#define RANAL_SYSCTL_MAX_IMMEDIATE     5
-
-#define RANAL_SYSCTL                   202
-
-static ctl_table kranal_ctl_table[] = {
-        {RANAL_SYSCTL_TIMEOUT, "timeout",
-         &kranal_tunables.kra_timeout, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        {RANAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout",
-         &kranal_tunables.kra_listener_timeout, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        {RANAL_SYSCTL_BACKLOG, "backlog",
-         &kranal_tunables.kra_backlog, sizeof(int),
-         0644, NULL, kranal_listener_procint},
-        {RANAL_SYSCTL_PORT, "port",
-         &kranal_tunables.kra_port, sizeof(int),
-         0644, NULL, kranal_listener_procint},
-        {RANAL_SYSCTL_MAX_IMMEDIATE, "max_immediate",
-         &kranal_tunables.kra_max_immediate, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        { 0 }
-};
-
-static ctl_table kranal_top_ctl_table[] = {
-        {RANAL_SYSCTL, "ranal", NULL, 0, 0555, kranal_ctl_table},
-        { 0 }
-};
-
-int
-kranal_sock_write (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
-        struct iovec  iov = {
-                .iov_base = buffer,
-                .iov_len  = nob
-        };
-        struct msghdr msg = {
-                .msg_name       = NULL,
-                .msg_namelen    = 0,
-                .msg_iov        = &iov,
-                .msg_iovlen     = 1,
-                .msg_control    = NULL,
-                .msg_controllen = 0,
-                .msg_flags      = MSG_DONTWAIT
-        };
-
-        /* We've set up the socket's send buffer to be large enough for
-         * everything we send, so a single non-blocking send should
-         * complete without error. */
-
-        set_fs(KERNEL_DS);
-        rc = sock_sendmsg(sock, &msg, iov.iov_len);
-        set_fs(oldmm);
-
-        if (rc == nob)
-                return 0;
-
-        if (rc >= 0)
-                return -EAGAIN;
-
-        return rc;
-}
-
-int
-kranal_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
-{
-        int            rc;
-        mm_segment_t   oldmm = get_fs();
-        long           ticks = timeout * HZ;
-        unsigned long  then;
-        struct timeval tv;
-
-        LASSERT (nob > 0);
-        LASSERT (ticks > 0);
-
-        for (;;) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
-
-                /* Set receive timeout to remaining time */
-                tv = (struct timeval) {
-                        .tv_sec = ticks / HZ,
-                        .tv_usec = ((ticks % HZ) * 1000000) / HZ
-                };
-                set_fs(KERNEL_DS);
-                rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
-                                     (char *)&tv, sizeof(tv));
-                set_fs(oldmm);
-                if (rc != 0) {
-                        CERROR("Can't set socket recv timeout %d: %d\n",
-                               timeout, rc);
-                        return rc;
-                }
-
-                set_fs(KERNEL_DS);
-                then = jiffies;
-                rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
-                ticks -= jiffies - then;
-                set_fs(oldmm);
-
-                if (rc < 0)
-                        return rc;
-
-                if (rc == 0)
-                        return -ECONNABORTED;
-
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
-
-                if (nob == 0)
-                        return 0;
-
-                if (ticks <= 0)
-                        return -ETIMEDOUT;
-        }
-}
-
-int
-kranal_create_sock(struct socket **sockp)
-{
-        struct socket       *sock;
-        int                  rc;
-        int                  option;
-        mm_segment_t         oldmm = get_fs();
-
-        rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock);
-        if (rc != 0) {
-                CERROR("Can't create socket: %d\n", rc);
-                return rc;
-        }
-
-        /* Ensure sending connection info doesn't block */
-        option = 2 * sizeof(kra_connreq_t);
-        set_fs(KERNEL_DS);
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-                             (char *)&option, sizeof(option));
-        set_fs(oldmm);
-        if (rc != 0) {
-                CERROR("Can't set send buffer %d: %d\n", option, rc);
-                goto failed;
-        }
-
-        option = 1;
-        set_fs(KERNEL_DS);
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-                             (char *)&option, sizeof(option));
-        set_fs(oldmm);
-        if (rc != 0) {
-                CERROR("Can't set SO_REUSEADDR: %d\n", rc);
-                goto failed;
-        }
-
-        *sockp = sock;
-        return 0;
-
- failed:
-        sock_release(sock);
-        return rc;
-}
-
-void
-kranal_pause(int ticks)
-{
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        schedule_timeout(ticks);
-}
 
 void
 kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, ptl_nid_t dstnid)
@@ -247,7 +63,7 @@ kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int timeout)
 {
         int         rc;
 
-        rc = kranal_sock_read(sock, connreq, sizeof(*connreq), timeout);
+        rc = libcfs_sock_read(sock, connreq, sizeof(*connreq), timeout);
         if (rc != 0) {
                 CERROR("Read failed: %d\n", rc);
                 return rc;
@@ -431,7 +247,7 @@ kranal_create_conn(kra_conn_t **connp, kra_device_t *dev)
         kranal_set_conn_uniqueness(conn);
 
         conn->rac_device = dev;
-        conn->rac_timeout = MAX(kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT);
+        conn->rac_timeout = MAX(*kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT);
         kranal_update_reaper_timeout(conn->rac_timeout);
 
         rrc = RapkCreateRi(dev->rad_handle, conn->rac_cqid,
@@ -616,7 +432,7 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp,
         }
 
         rc = kranal_recv_connreq(sock, &rx_connreq,
-                                 kranal_tunables.kra_listener_timeout);
+                                 *kranal_tunables.kra_listener_timeout);
         if (rc != 0) {
                 CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n",
                        HIPQUAD(peer_ip), peer_port, rc);
@@ -640,7 +456,7 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp,
 
         kranal_pack_connreq(&tx_connreq, conn, rx_connreq.racr_srcnid);
 
-        rc = kranal_sock_write(sock, &tx_connreq, sizeof(tx_connreq));
+        rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq), 0);
         if (rc != 0) {
                 CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n",
                        HIPQUAD(peer_ip), peer_port, rc);
@@ -663,63 +479,32 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp,
 int
 ranal_connect_sock(kra_peer_t *peer, struct socket **sockp)
 {
-        struct sockaddr_in  locaddr;
-        struct sockaddr_in  srvaddr;
-        struct socket      *sock;
         unsigned int        port;
+        int                 fatal;
         int                 rc;
 
         for (port = 1023; port >= 512; port--) {
 
-                memset(&locaddr, 0, sizeof(locaddr));
-                locaddr.sin_family      = AF_INET;
-                locaddr.sin_port        = htons(port);
-                locaddr.sin_addr.s_addr = htonl(INADDR_ANY);
-
-                memset (&srvaddr, 0, sizeof (srvaddr));
-                srvaddr.sin_family      = AF_INET;
-                srvaddr.sin_port        = htons (peer->rap_port);
-                srvaddr.sin_addr.s_addr = htonl (peer->rap_ip);
-
-                rc = kranal_create_sock(&sock);
-                if (rc != 0)
-                        return rc;
-
-                rc = sock->ops->bind(sock,
-                                     (struct sockaddr *)&locaddr, sizeof(locaddr));
-                if (rc != 0) {
-                        sock_release(sock);
-
-                        if (rc == -EADDRINUSE) {
-                                CDEBUG(D_NET, "Port %d already in use\n", port);
-                                continue;
-                        }
-
-                        CERROR("Can't bind to reserved port %d: %d\n", port, rc);
-                        return rc;
-                }
-
-                rc = sock->ops->connect(sock,
-                                        (struct sockaddr *)&srvaddr, sizeof(srvaddr),
-                                        0);
-                if (rc == 0) {
-                        *sockp = sock;
+                rc = libcfs_sock_connect(sockp, &fatal,
+                                         2 * sizeof(kra_conn_t),
+                                         0, port, 
+                                         peer->rap_ip, peer->rap_port);
+                if (rc == 0)
                         return 0;
+                
+                if (!fatal) {
+                        CDEBUG(D_NET, "Port %d already in use\n", port);
+                        continue;
                 }
 
-                sock_release(sock);
-
-                if (rc != -EADDRNOTAVAIL) {
-                        CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n",
-                               port, HIPQUAD(peer->rap_ip), peer->rap_port, rc);
-                        return rc;
-                }
-
-                CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n",
-                       port, HIPQUAD(peer->rap_ip), peer->rap_port);
+                CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n",
+                       port, HIPQUAD(peer->rap_ip), peer->rap_port, rc);
+                return rc;
         }
 
         /* all ports busy */
+        CERROR("Can't connect to %u.%u.%u.%u/%d: all ports busy\n",
+               HIPQUAD(peer->rap_ip), peer->rap_port);
         return -EHOSTUNREACH;
 }
 
@@ -754,21 +539,21 @@ kranal_active_conn_handshake(kra_peer_t *peer,
          * immediately after accepting a connection, so we connect and then
          * send immediately. */
 
-        rc = kranal_sock_write(sock, &connreq, sizeof(connreq));
+        rc = libcfs_sock_write(sock, &connreq, sizeof(connreq), 0);
         if (rc != 0) {
                 CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n",
                        HIPQUAD(peer->rap_ip), peer->rap_port, rc);
                 goto failed_1;
         }
 
-        rc = kranal_recv_connreq(sock, &connreq, kranal_tunables.kra_timeout);
+        rc = kranal_recv_connreq(sock, &connreq, *kranal_tunables.kra_timeout);
         if (rc != 0) {
                 CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n",
                        HIPQUAD(peer->rap_ip), peer->rap_port, rc);
                 goto failed_1;
         }
 
-        sock_release(sock);
+        libcfs_sock_release(sock);
         rc = -EPROTO;
 
         if (connreq.racr_srcnid != peer->rap_nid) {
@@ -797,7 +582,7 @@ kranal_active_conn_handshake(kra_peer_t *peer,
         return 0;
 
  failed_1:
-        sock_release(sock);
+        libcfs_sock_release(sock);
  failed_0:
         kranal_conn_decref(conn);
         return rc;
@@ -846,9 +631,10 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer)
                         return rc;
 
                 /* assume this is a new peer */
-                peer = kranal_create_peer(peer_nid);
-                if (peer == NULL) {
-                        CERROR("Can't allocate peer for "LPX64"\n", peer_nid);
+                rc = kranal_create_peer(&peer, peer_nid);
+                if (rc != 0) {
+                        CERROR("Can't create conn for %s\n", 
+                               libcfs_nid2str(peer_nid));
                         kranal_conn_decref(conn);
                         return -ENOMEM;
                 }
@@ -971,7 +757,8 @@ kranal_connect (kra_peer_t *peer)
                 LASSERT (list_empty(&peer->rap_tx_queue));
 
                 /* reset reconnection timeouts */
-                peer->rap_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL;
+                peer->rap_reconnect_interval = 
+                        *kranal_tunables.kra_min_reconnect_interval;
                 peer->rap_reconnect_time = CURRENT_SECONDS;
 
                 write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
@@ -980,8 +767,9 @@ kranal_connect (kra_peer_t *peer)
 
         LASSERT (peer->rap_reconnect_interval != 0);
         peer->rap_reconnect_time = CURRENT_SECONDS + peer->rap_reconnect_interval;
-        peer->rap_reconnect_interval = MAX(RANAL_MAX_RECONNECT_INTERVAL,
-                                           1 * peer->rap_reconnect_interval);
+        peer->rap_reconnect_interval = 
+                MAX(*kranal_tunables.kra_max_reconnect_interval,
+                    2 * peer->rap_reconnect_interval);
 
         /* Grab all blocked packets while we have the global lock */
         list_add(&zombies, &peer->rap_tx_queue);
@@ -1007,14 +795,13 @@ kranal_connect (kra_peer_t *peer)
 void
 kranal_free_acceptsock (kra_acceptsock_t *ras)
 {
-        sock_release(ras->ras_sock);
+        libcfs_sock_release(ras->ras_sock);
         PORTAL_FREE(ras, sizeof(*ras));
 }
 
 int
 kranal_listener (void *arg)
 {
-        struct sockaddr_in addr;
         wait_queue_t       wait;
         struct socket     *sock;
         kra_acceptsock_t  *ras;
@@ -1023,37 +810,19 @@ kranal_listener (void *arg)
         int                rc;
         unsigned long      flags;
 
-        /* Parent thread holds kra_nid_mutex, and is, or is about to
-         * block on kra_listener_signal */
+        /* Parent thread is blocked or about to block on kra_listener_signal */
 
-        port = kranal_tunables.kra_port;
+        port = *kranal_tunables.kra_port;
         snprintf(name, sizeof(name), "kranal_lstn%03d", port);
         kportal_daemonize(name);
         kportal_blockallsigs();
 
         init_waitqueue_entry(&wait, current);
 
-        rc = kranal_create_sock(&sock);
+        rc = libcfs_sock_listen(&sock, 0, port,
+                                *kranal_tunables.kra_backlog);
         if (rc != 0)
-                goto out_0;
-
-        memset(&addr, 0, sizeof(addr));
-        addr.sin_family      = AF_INET;
-        addr.sin_port        = htons(port);
-        addr.sin_addr.s_addr = INADDR_ANY;
-
-        rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr));
-        if (rc != 0) {
-                CERROR("Can't bind to port %d\n", port);
-                goto out_1;
-        }
-
-        rc = sock->ops->listen(sock, kranal_tunables.kra_backlog);
-        if (rc != 0) {
-                CERROR("Can't set listen backlog %d: %d\n",
-                       kranal_tunables.kra_backlog, rc);
-                goto out_1;
-        }
+                goto out;
 
         LASSERT (kranal_data.kra_listener_sock == NULL);
         kranal_data.kra_listener_sock = sock;
@@ -1071,68 +840,39 @@ kranal_listener (void *arg)
                 if (ras == NULL) {
                         PORTAL_ALLOC(ras, sizeof(*ras));
                         if (ras == NULL) {
-                                CERROR("Out of Memory: pausing...\n");
-                                kranal_pause(HZ);
+                                CERROR("ENOMEM: listener pausing\n");
+                                libcfs_pause(cfs_time_seconds(1));
                                 continue;
                         }
-                        ras->ras_sock = NULL;
                 }
 
-                if (ras->ras_sock == NULL) {
-                        ras->ras_sock = sock_alloc();
-                        if (ras->ras_sock == NULL) {
-                                CERROR("Can't allocate socket: pausing...\n");
-                                kranal_pause(HZ);
-                                continue;
+                rc = libcfs_sock_accept(&ras->ras_sock, sock,
+                                        2 * sizeof(kra_conn_t));
+                if (rc != 0) {
+                        if (rc != -EAGAIN) {
+                                CWARN("Accept error: %d, listener pausing\n", rc);
+                                libcfs_pause(cfs_time_seconds(1));
                         }
-                        /* XXX this should add a ref to sock->ops->owner, if
-                         * TCP could be a module */
-                        ras->ras_sock->type = sock->type;
-                        ras->ras_sock->ops = sock->ops;
+                        continue;
                 }
 
-                set_current_state(TASK_INTERRUPTIBLE);
-
-                rc = sock->ops->accept(sock, ras->ras_sock, O_NONBLOCK);
+                spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
 
-                /* Sleep for socket activity? */
-                if (rc == -EAGAIN &&
-                    kranal_data.kra_listener_shutdown == 0)
-                        schedule();
+                list_add_tail(&ras->ras_list, &kranal_data.kra_connd_acceptq);
 
-                set_current_state(TASK_RUNNING);
-
-                if (rc == 0) {
-                        spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
-
-                        list_add_tail(&ras->ras_list,
-                                      &kranal_data.kra_connd_acceptq);
-
-                        spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
-                        wake_up(&kranal_data.kra_connd_waitq);
-
-                        ras = NULL;
-                        continue;
-                }
+                spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
+                wake_up(&kranal_data.kra_connd_waitq);
 
-                if (rc != -EAGAIN) {
-                        CERROR("Accept failed: %d, pausing...\n", rc);
-                        kranal_pause(HZ);
-                }
+                ras = NULL;
         }
 
-        if (ras != NULL) {
-                if (ras->ras_sock != NULL)
-                        sock_release(ras->ras_sock);
+        if (ras != NULL)
                 PORTAL_FREE(ras, sizeof(*ras));
-        }
 
         rc = 0;
-        remove_wait_queue(sock->sk->sk_sleep, &wait);
- out_1:
-        sock_release(sock);
+        libcfs_sock_release(sock);
         kranal_data.kra_listener_sock = NULL;
- out_0:
+ out:
         /* set completion status and unblock thread waiting for me
          * (parent on startup failure, executioner on normal shutdown) */
         kranal_data.kra_listener_shutdown = rc;
@@ -1149,7 +889,6 @@ kranal_start_listener (void)
 
         CDEBUG(D_NET, "Starting listener\n");
 
-        /* Called holding kra_nid_mutex: listener stopped */
         LASSERT (kranal_data.kra_listener_sock == NULL);
 
         kranal_data.kra_listener_shutdown = 0;
@@ -1170,136 +909,33 @@ kranal_start_listener (void)
 }
 
 void
-kranal_stop_listener(int clear_acceptq)
+kranal_stop_listener(void)
 {
-        struct list_head  zombie_accepts;
-        unsigned long     flags;
-        kra_acceptsock_t *ras;
-
         CDEBUG(D_NET, "Stopping listener\n");
 
-        /* Called holding kra_nid_mutex: listener running */
         LASSERT (kranal_data.kra_listener_sock != NULL);
 
         kranal_data.kra_listener_shutdown = 1;
-        wake_up_all(kranal_data.kra_listener_sock->sk->sk_sleep);
+        libcfs_sock_abort_accept(kranal_data.kra_listener_sock);
 
         /* Block until listener has torn down. */
         down(&kranal_data.kra_listener_signal);
 
         LASSERT (kranal_data.kra_listener_sock == NULL);
         CDEBUG(D_NET, "Listener stopped\n");
-
-        if (!clear_acceptq)
-                return;
-
-        /* Close any unhandled accepts */
-        spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
-
-        list_add(&zombie_accepts, &kranal_data.kra_connd_acceptq);
-        list_del_init(&kranal_data.kra_connd_acceptq);
-
-        spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
-
-        while (!list_empty(&zombie_accepts)) {
-                ras = list_entry(zombie_accepts.next,
-                                 kra_acceptsock_t, ras_list);
-                list_del(&ras->ras_list);
-                kranal_free_acceptsock(ras);
-        }
 }
 
 int
-kranal_listener_procint(ctl_table *table, int write, struct file *filp,
-                        void *buffer, size_t *lenp)
+kranal_create_peer (kra_peer_t **peerp, ptl_nid_t nid)
 {
-        int   *tunable = (int *)table->data;
-        int    old_val;
-        int    rc;
-
-        /* No race with nal initialisation since the nal is setup all the time
-         * it's loaded.  When that changes, change this! */
-        LASSERT (kranal_data.kra_init == RANAL_INIT_ALL);
-
-        down(&kranal_data.kra_nid_mutex);
-
-        LASSERT (tunable == &kranal_tunables.kra_port ||
-                 tunable == &kranal_tunables.kra_backlog);
-        old_val = *tunable;
-
-        rc = proc_dointvec(table, write, filp, buffer, lenp);
-
-        if (write &&
-            (*tunable != old_val ||
-             kranal_data.kra_listener_sock == NULL)) {
-
-                if (kranal_data.kra_listener_sock != NULL)
-                        kranal_stop_listener(0);
-
-                rc = kranal_start_listener();
-
-                if (rc != 0) {
-                        CWARN("Unable to start listener with new tunable:"
-                              " reverting to old value\n");
-                        *tunable = old_val;
-                        kranal_start_listener();
-                }
-        }
-
-        up(&kranal_data.kra_nid_mutex);
-
-        LASSERT (kranal_data.kra_init == RANAL_INIT_ALL);
-        return rc;
-}
-
-int
-kranal_set_mynid(ptl_nid_t nid)
-{
-        unsigned long    flags;
-        ptl_ni_t        *ni = kranal_data.kra_ni;
-        int              rc = 0;
-
-        CDEBUG(D_NET, "setting mynid to "LPX64" (old nid="LPX64")\n",
-               nid, ni->ni_nid);
-
-        down(&kranal_data.kra_nid_mutex);
-
-        if (nid == ni->ni_nid) {
-                /* no change of NID */
-                up(&kranal_data.kra_nid_mutex);
-                return 0;
-        }
-
-        if (kranal_data.kra_listener_sock != NULL)
-                kranal_stop_listener(1);
-
-        write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-        kranal_data.kra_peerstamp++;
-        ni->ni_nid = nid;
-        write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
-
-        /* Delete all existing peers and their connections after new
-         * NID/connstamp set to ensure no old connections in our brave
-         * new world. */
-        kranal_del_peer(PTL_NID_ANY);
-
-        if (nid != PTL_NID_ANY)
-                rc = kranal_start_listener();
-
-        up(&kranal_data.kra_nid_mutex);
-        return rc;
-}
-
-kra_peer_t *
-kranal_create_peer (ptl_nid_t nid)
-{
-        kra_peer_t *peer;
+        kra_peer_t    *peer;
+        unsigned long  flags;
 
         LASSERT (nid != PTL_NID_ANY);
 
         PORTAL_ALLOC(peer, sizeof(*peer));
         if (peer == NULL)
-                return NULL;
+                return -ENOMEM;
 
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
 
@@ -1312,15 +948,33 @@ kranal_create_peer (ptl_nid_t nid)
         INIT_LIST_HEAD(&peer->rap_tx_queue);
 
         peer->rap_reconnect_time = CURRENT_SECONDS;
-        peer->rap_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL;
+        peer->rap_reconnect_interval = 
+                *kranal_tunables.kra_min_reconnect_interval;
 
-        atomic_inc(&kranal_data.kra_npeers);
-        return peer;
+        write_lock_irqsave(&kranal_data.kra_global_lock, flags);
+
+        if (kranal_data.kra_listener_shutdown) {
+                /* shutdown has started already */
+                write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
+                
+                PORTAL_FREE(peer, sizeof(*peer));
+                CERROR("Can't create peer: network shutdown\n");
+                return -ESHUTDOWN;
+        }
+
+        kranal_data.kra_npeers++;
+
+        write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
+
+        *peerp = peer;
+        return 0;
 }
 
 void
 kranal_destroy_peer (kra_peer_t *peer)
 {
+        unsigned long flags;
+
         CDEBUG(D_NET, "peer "LPX64" %p deleted\n", peer->rap_nid, peer);
 
         LASSERT (atomic_read(&peer->rap_refcount) == 0);
@@ -1337,7 +991,9 @@ kranal_destroy_peer (kra_peer_t *peer)
          * they are destroyed, so we can be assured that _all_ state to do
          * with this peer has been cleaned up when its refcount drops to
          * zero. */
-        atomic_dec(&kranal_data.kra_npeers);
+        write_lock_irqsave(&kranal_data.kra_global_lock, flags);
+        kranal_data.kra_npeers--;
+        write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
 }
 
 kra_peer_t *
@@ -1432,13 +1088,14 @@ kranal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port)
         unsigned long      flags;
         kra_peer_t        *peer;
         kra_peer_t        *peer2;
+        int                rc;
 
         if (nid == PTL_NID_ANY)
                 return -EINVAL;
 
-        peer = kranal_create_peer(nid);
-        if (peer == NULL)
-                return -ENOMEM;
+        rc = kranal_create_peer(&peer, nid);
+        if (rc != 0)
+                return rc;
 
         write_lock_irqsave(&kranal_data.kra_global_lock, flags);
 
@@ -1671,10 +1328,15 @@ kranal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg)
                 break;
         }
         case IOC_PORTAL_REGISTER_MYNID: {
-                if (data->ioc_nid == PTL_NID_ANY)
+                /* Ignore if this is a noop */
+                if (data->ioc_nid == ni->ni_nid) {
+                        rc = 0;
+                } else {
+                        CERROR("obsolete IOC_PORTAL_REGISTER_MYNID: %s(%s)\n",
+                               libcfs_nid2str(data->ioc_nid),
+                               libcfs_nid2str(ni->ni_nid));
                         rc = -EINVAL;
-                else
-                        rc = kranal_set_mynid(data->ioc_nid);
+                }
                 break;
         }
         }
@@ -1741,7 +1403,8 @@ kranal_alloc_txdescs(struct list_head *freelist, int n)
 int
 kranal_device_init(int id, kra_device_t *dev)
 {
-        const int         total_ntx = RANAL_NTX + RANAL_NTX_NBLK;
+        int               total_ntx = *kranal_tunables.kra_ntx + 
+                                      *kranal_tunables.kra_ntx_nblk;
         RAP_RETURN        rrc;
 
         dev->rad_id = id;
@@ -1762,16 +1425,17 @@ kranal_device_init(int id, kra_device_t *dev)
         rrc = RapkCreateCQ(dev->rad_handle, total_ntx, RAP_CQTYPE_SEND,
                            &dev->rad_rdma_cqh);
         if (rrc != RAP_SUCCESS) {
-                CERROR("Can't create rdma cq size %d"
-                       " for device %d: %d\n", total_ntx, id, rrc);
+                CERROR("Can't create rdma cq size %d for device %d: %d\n",
+                       total_ntx, id, rrc);
                 goto failed_1;
         }
 
-        rrc = RapkCreateCQ(dev->rad_handle, RANAL_FMA_CQ_SIZE, RAP_CQTYPE_RECV,
-                           &dev->rad_fma_cqh);
+        rrc = RapkCreateCQ(dev->rad_handle, 
+                           *kranal_tunables.kra_fma_cq_size, 
+                           RAP_CQTYPE_RECV, &dev->rad_fma_cqh);
         if (rrc != RAP_SUCCESS) {
-                CERROR("Can't create fma cq size %d"
-                       " for device %d: %d\n", RANAL_FMA_CQ_SIZE, id, rrc);
+                CERROR("Can't create fma cq size %d for device %d: %d\n", 
+                       *kranal_tunables.kra_fma_cq_size, id, rrc);
                 goto failed_2;
         }
 
@@ -1788,6 +1452,13 @@ kranal_device_init(int id, kra_device_t *dev)
 void
 kranal_device_fini(kra_device_t *dev)
 {
+        LASSERT (list_empty(&dev->rad_ready_conns));
+        LASSERT (list_empty(&dev->rad_new_conns));
+        LASSERT (dev->rad_nphysmap == 0);
+        LASSERT (dev->rad_nppphysmap == 0);
+        LASSERT (dev->rad_nvirtmap == 0);
+        LASSERT (dev->rad_nobvirtmap == 0);
+                
         LASSERT(dev->rad_scheduler == NULL);
         RapkDestroyCQ(dev->rad_handle, dev->rad_fma_cqh);
         RapkDestroyCQ(dev->rad_handle, dev->rad_rdma_cqh);
@@ -1812,42 +1483,62 @@ kranal_shutdown (ptl_ni_t *ni)
                 LBUG();
 
         case RANAL_INIT_ALL:
-                /* resetting my NID to unadvertises me, removes my
-                 * listener and nukes all current peers */
-                kranal_set_mynid(PTL_NID_ANY);
-                /* no new peers or conns */
+                /* Stop listener and prevent new peers from being created */
+                kranal_stop_listener();
 
-                /* Wait for all peer/conn state to clean up */
+                /* Remove all existing peers from the peer table */
+                kranal_del_peer(PTL_NID_ANY);
+
+                /* Wait for pending conn reqs to be handled */
                 i = 2;
-                while (atomic_read(&kranal_data.kra_nconns) != 0 ||
-                       atomic_read(&kranal_data.kra_npeers) != 0) {
+                spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
+                while (!list_empty(&kranal_data.kra_connd_acceptq)) {
+                        spin_unlock_irqrestore(&kranal_data.kra_connd_lock, 
+                                               flags);
                         i++;
-                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
-                               "waiting for %d peers and %d conns to close down\n",
-                               atomic_read(&kranal_data.kra_npeers),
-                               atomic_read(&kranal_data.kra_nconns));
-                        kranal_pause(HZ);
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
+                               "waiting for conn reqs to clean up\n");
+                        libcfs_pause(cfs_time_seconds(1));
+
+                        spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
                 }
+                spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
+
+                /* Wait for all peers to be freed */
+                i = 2;
+                write_lock_irqsave(&kranal_data.kra_global_lock, flags);
+                while (kranal_data.kra_npeers != 0) {
+                        write_unlock_irqrestore(&kranal_data.kra_global_lock,
+                                                flags);
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
+                               "waiting for %d peers to close down\n",
+                               kranal_data.kra_npeers);
+                        libcfs_pause(cfs_time_seconds(1));
+
+                        write_lock_irqsave(&kranal_data.kra_global_lock, 
+                                           flags);
+                }
+                write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
                 /* fall through */
 
         case RANAL_INIT_DATA:
                 break;
         }
 
-        /* Conn/Peer state all cleaned up BEFORE setting shutdown, so threads
-         * don't have to worry about shutdown races */
-        LASSERT (atomic_read(&kranal_data.kra_nconns) == 0);
-        LASSERT (atomic_read(&kranal_data.kra_npeers) == 0);
+        /* Peer state all cleaned up BEFORE setting shutdown, so threads don't
+         * have to worry about shutdown races.  NB connections may be created
+         * while there are still active connds, but these will be temporary
+         * since peer creation always fails after the listener has started to
+         * shut down. */
+        LASSERT (kranal_data.kra_npeers == 0);
         
-        /* flag threads to terminate; wake and wait for them to die */
+        /* Flag threads to terminate */
         kranal_data.kra_shutdown = 1;
 
         for (i = 0; i < kranal_data.kra_ndevs; i++) {
                 kra_device_t *dev = &kranal_data.kra_devices[i];
 
-                LASSERT (list_empty(&dev->rad_ready_conns));
-                LASSERT (list_empty(&dev->rad_new_conns));
-
                 spin_lock_irqsave(&dev->rad_lock, flags);
                 wake_up(&dev->rad_waitq);
                 spin_unlock_irqrestore(&dev->rad_lock, flags);
@@ -1862,16 +1553,17 @@ kranal_shutdown (ptl_ni_t *ni)
         wake_up_all(&kranal_data.kra_connd_waitq);
         spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
 
+        /* Wait for threads to exit */
         i = 2;
         while (atomic_read(&kranal_data.kra_nthreads) != 0) {
                 i++;
                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                        "Waiting for %d threads to terminate\n",
                        atomic_read(&kranal_data.kra_nthreads));
-                kranal_pause(HZ);
+                libcfs_pause(cfs_time_seconds(1));
         }
 
-        LASSERT (atomic_read(&kranal_data.kra_npeers) == 0);
+        LASSERT (kranal_data.kra_npeers == 0);
         if (kranal_data.kra_peers != NULL) {
                 for (i = 0; i < kranal_data.kra_peer_hash_size; i++)
                         LASSERT (list_empty(&kranal_data.kra_peers[i]));
@@ -1922,6 +1614,11 @@ kranal_startup (ptl_ni_t *ni)
                 CERROR ("Only 1 instance supported\n");
                 return PTL_FAIL;
         }
+
+        if (ptl_set_ip_niaddr(ni) != PTL_OK) {
+                CERROR ("Can't determine my NID\n");
+                return PTL_FAIL;
+        }
         
         memset(&kranal_data, 0, sizeof(kranal_data)); /* zero pointers, flags etc */
 
@@ -1938,7 +1635,6 @@ kranal_startup (ptl_ni_t *ni)
         kranal_data.kra_connstamp =
         kranal_data.kra_peerstamp = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
 
-        init_MUTEX(&kranal_data.kra_nid_mutex);
         init_MUTEX_LOCKED(&kranal_data.kra_listener_signal);
 
         rwlock_init(&kranal_data.kra_global_lock);
@@ -1989,11 +1685,13 @@ kranal_startup (ptl_ni_t *ni)
         for (i = 0; i < kranal_data.kra_conn_hash_size; i++)
                 INIT_LIST_HEAD(&kranal_data.kra_conns[i]);
 
-        rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs, RANAL_NTX);
+        rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs, 
+                                  *kranal_tunables.kra_ntx);
         if (rc != 0)
                 goto failed;
 
-        rc = kranal_alloc_txdescs(&kranal_data.kra_idle_nblk_txs,RANAL_NTX_NBLK);
+        rc = kranal_alloc_txdescs(&kranal_data.kra_idle_nblk_txs,
+                                  *kranal_tunables.kra_ntx_nblk);
         if (rc != 0)
                 goto failed;
 
@@ -2003,7 +1701,7 @@ kranal_startup (ptl_ni_t *ni)
                 goto failed;
         }
 
-        for (i = 0; i < RANAL_N_CONND; i++) {
+        for (i = 0; i < *kranal_tunables.kra_n_connd; i++) {
                 rc = kranal_thread_start(kranal_connd, (void *)(unsigned long)i);
                 if (rc != 0) {
                         CERROR("Can't spawn ranal connd[%d]: %d\n",
@@ -2014,51 +1712,18 @@ kranal_startup (ptl_ni_t *ni)
 
         LASSERT (kranal_data.kra_ndevs == 0);
 
-        if (ni->ni_interfaces[0] == NULL) {
-                /* Use all available RapidArray devices */
-                for (i = 0; i < sizeof(kranal_devids)/sizeof(kranal_devids[0]); i++) {
-                        LASSERT (i < RANAL_MAXDEVS);
-
-                        dev = &kranal_data.kra_devices[kranal_data.kra_ndevs];
+        /* Use all available RapidArray devices */
+        for (i = 0; i < RANAL_MAXDEVS; i++) {
+                dev = &kranal_data.kra_devices[kranal_data.kra_ndevs];
 
-                        rc = kranal_device_init(kranal_devids[i], dev);
-                        if (rc == 0)
-                                kranal_data.kra_ndevs++;
-                }
-
-                if (kranal_data.kra_ndevs == 0) {
-                        CERROR("Can't initialise any RapidArray devices\n");
-                        goto failed;
-                }
-        } else {
-                /* Use specified RapidArray devices */
-                for (i = 0; i < PTL_MAX_INTERFACES; i++) {
-                        int   devid;
-                        int   len;
-
-                        if (kranal_data.kra_ndevs == RANAL_MAXDEVS) {
-                                CERROR("Too many interfaces\n");
-                                goto failed;
-                        }
-
-                        dev = &kranal_data.kra_devices[kranal_data.kra_ndevs];
-
-                        if (sscanf(ni->ni_interfaces[i], "%d%n", &devid, &len) < 1 ||
-                            len != strlen(ni->ni_interfaces[i])) {
-                                CERROR("Can't parse interface '%s'\n", 
-                                       ni->ni_interfaces[i]);
-                                goto failed;
-                        }
-                        
-                        rc = kranal_device_init(devid, dev);
-                        if (rc != 0) {
-                                CERROR("Can't open interface '%s': %d\n",
-                                       ni->ni_interfaces[i], rc);
-                                goto failed;
-                        }
-                        
+                rc = kranal_device_init(kranal_devids[i], dev);
+                if (rc == 0)
                         kranal_data.kra_ndevs++;
-                }
+        }
+
+        if (kranal_data.kra_ndevs == 0) {
+                CERROR("Can't initialise any RapidArray devices\n");
+                goto failed;
         }
         
         for (i = 0; i < kranal_data.kra_ndevs; i++) {
@@ -2071,11 +1736,15 @@ kranal_startup (ptl_ni_t *ni)
                 }
         }
 
+        rc = kranal_start_listener();
+        if (rc != 0)
+                goto failed;
+
         /* flag everything initialised */
         kranal_data.kra_init = RANAL_INIT_ALL;
         /*****************************************************/
-        
-        CDEBUG(D_MALLOC, "initial kmem %d\n", atomic_read(&portal_kmemory));
+
+        CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem);
         printk(KERN_INFO "Lustre: RapidArray NAL loaded "
                "(initial mem %d)\n", pkmem);
 
@@ -2089,10 +1758,8 @@ kranal_startup (ptl_ni_t *ni)
 void __exit
 kranal_module_fini (void)
 {
-        if (kranal_tunables.kra_sysctl != NULL)
-                unregister_sysctl_table(kranal_tunables.kra_sysctl);
-
         ptl_unregister_nal(&kranal_nal);
+        kranal_tunables_fini();
 }
 
 int __init
@@ -2100,31 +1767,12 @@ kranal_module_init (void)
 {
         int    rc;
 
-        /* the following must be sizeof(int) for
-         * proc_dointvec/kranal_listener_procint() */
-        CLASSERT (sizeof(kranal_tunables.kra_timeout) == sizeof(int));
-        CLASSERT (sizeof(kranal_tunables.kra_listener_timeout) == sizeof(int));
-        CLASSERT (sizeof(kranal_tunables.kra_backlog) == sizeof(int));
-        CLASSERT (sizeof(kranal_tunables.kra_port) == sizeof(int));
-        CLASSERT (sizeof(kranal_tunables.kra_max_immediate) == sizeof(int));
-
-        /* Initialise dynamic tunables to defaults once only */
-        kranal_tunables.kra_timeout = RANAL_TIMEOUT;
-        kranal_tunables.kra_listener_timeout = RANAL_LISTENER_TIMEOUT;
-        kranal_tunables.kra_backlog = RANAL_BACKLOG;
-        kranal_tunables.kra_port = RANAL_PORT;
-        kranal_tunables.kra_max_immediate = RANAL_MAX_IMMEDIATE;
+        rc = kranal_tunables_init();
+        if (rc != 0)
+                return rc;
 
         ptl_register_nal(&kranal_nal);
 
-        kranal_tunables.kra_sysctl =
-                register_sysctl_table(kranal_top_ctl_table, 0);
-        if (kranal_tunables.kra_sysctl == NULL) {
-                CERROR("Can't register sysctl table\n");
-                ptl_unregister_nal(&kranal_nal);
-                return -ENOMEM;
-        }
-
         return 0;
 }
 
index 71cd666..06ce4c8 100644 (file)
 
 #include <rapl.h>
 
-#define RANAL_MAXDEVS       2                   /* max # devices RapidArray supports */
+/* default vals for modparams/tunables */
+#define RANAL_N_CONND             4             /* # connection daemons */
 
-#define RANAL_N_CONND       4                   /* # connection daemons */
-
-#define RANAL_MIN_RECONNECT_INTERVAL 1          /* first failed connection retry (seconds)... */
+#define RANAL_MIN_RECONNECT_INTERVAL 1          /* first failed connection retry... */
 #define RANAL_MAX_RECONNECT_INTERVAL 60         /* ...exponentially increasing to this */
 
-#define RANAL_FMA_MAX_PREFIX      232           /* max size of FMA "Prefix" */
-#define RANAL_FMA_MAX_DATA        ((7<<10)-256) /* Max FMA MSG is 7K including prefix */
-
-#define RANAL_PEER_HASH_SIZE  101               /* # peer lists */
-#define RANAL_CONN_HASH_SIZE  101               /* # conn lists */
+#define RANAL_NTX                 64            /* # tx descs */
+#define RANAL_NTX_NBLK            256           /* # reserved tx descs */
 
-#define RANAL_NTX             64                /* # tx descs */
-#define RANAL_NTX_NBLK        256               /* # reserved tx descs */
-
-#define RANAL_FMA_CQ_SIZE     8192              /* # entries in receive CQ
+#define RANAL_FMA_CQ_SIZE         8192          /* # entries in receive CQ
                                                  * (overflow is a performance hit) */
+#define RANAL_TIMEOUT             30            /* comms timeout (seconds) */
+#define RANAL_LISTENER_TIMEOUT    5             /* listener timeout (seconds) */
+#define RANAL_BACKLOG             127           /* listener's backlog */
+#define RANAL_PORT                987           /* listener's port */
+#define RANAL_MAX_IMMEDIATE      (2<<10)        /* immediate payload breakpoint */
+
+/* tunables determined at compile time */
+#define RANAL_RESCHED             100           /* # scheduler loops before reschedule */
 
-#define RANAL_RESCHED         100               /* # scheduler loops before reschedule */
+#define RANAL_PEER_HASH_SIZE      101           /* # peer lists */
+#define RANAL_CONN_HASH_SIZE      101           /* # conn lists */
 
-#define RANAL_MIN_TIMEOUT     5                 /* minimum timeout interval (seconds) */
+#define RANAL_MIN_TIMEOUT         5             /* minimum timeout interval (seconds) */
 #define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2)  /* timeout -> keepalive interval */
 
-/* default vals for runtime tunables */
-#define RANAL_TIMEOUT           30              /* comms timeout (seconds) */
-#define RANAL_LISTENER_TIMEOUT   5              /* listener timeout (seconds) */
-#define RANAL_BACKLOG          127              /* listener's backlog */
-#define RANAL_PORT             988              /* listener's port */
-#define RANAL_MAX_IMMEDIATE    (2<<10)          /* immediate payload breakpoint */
+/* fixed constants */
+#define RANAL_MAXDEVS             2             /* max # devices RapidArray supports */
+#define RANAL_FMA_MAX_PREFIX      232           /* max bytes in FMA "Prefix" we can use */
+#define RANAL_FMA_MAX_DATA        ((7<<10)-256) /* Max FMA MSG is 7K including prefix */
+
 
 typedef struct
 {
-        int               kra_timeout;          /* comms timeout (seconds) */
-        int               kra_listener_timeout; /* max time the listener can block */
-        int               kra_backlog;          /* listener's backlog */
-        int               kra_port;             /* listener's TCP/IP port */
-        int               kra_max_immediate;    /* immediate payload breakpoint */
-
+        int              *kra_n_connd;          /* # connection daemons */
+        int              *kra_min_reconnect_interval; /* first failed connection retry... */
+        int              *kra_max_reconnect_interval; /* ...exponentially increasing to this */
+        int              *kra_ntx;              /* # tx descs */
+        int              *kra_ntx_nblk;         /* # reserved tx descs */
+        int              *kra_fma_cq_size;      /* # entries in receive CQ */
+        int              *kra_timeout;          /* comms timeout (seconds) */
+        int              *kra_listener_timeout; /* max time the listener can block */
+        int              *kra_backlog;          /* listener's backlog */
+        int              *kra_port;             /* listener's TCP/IP port */
+        int              *kra_max_immediate;    /* immediate payload breakpoint */
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
         struct ctl_table_header *kra_sysctl;    /* sysctl interface */
+#endif
 } kra_tunables_t;
 
 typedef struct
@@ -114,6 +123,10 @@ typedef struct
         wait_queue_head_t       rad_waitq;      /* scheduler waits here */
         spinlock_t              rad_lock;       /* serialise */
         void                   *rad_scheduler;  /* scheduling thread */
+        unsigned int            rad_nphysmap;   /* # phys mappings */
+        unsigned int            rad_nppphysmap; /* # phys pages mapped */
+        unsigned int            rad_nvirtmap;   /* # virt mappings */
+        unsigned long           rad_nobvirtmap; /* # virt bytes mapped */
 } kra_device_t;
 
 typedef struct
@@ -123,7 +136,6 @@ typedef struct
         atomic_t          kra_nthreads;         /* # live threads */
         ptl_ni_t         *kra_ni;               /* _the_ nal instance */
         
-        struct semaphore  kra_nid_mutex;        /* serialise NID/listener ops */
         struct semaphore  kra_listener_signal;  /* block for listener startup/shutdown */
         struct socket    *kra_listener_sock;    /* listener's socket */
         int               kra_listener_shutdown; /* ask listener to close */
@@ -135,7 +147,7 @@ typedef struct
 
         struct list_head *kra_peers;            /* hash table of all my known peers */
         int               kra_peer_hash_size;   /* size of kra_peers */
-        atomic_t          kra_npeers;           /* # peers extant */
+        int               kra_npeers;           /* # peers extant */
 
         struct list_head *kra_conns;            /* conns hashed by cqid */
         int               kra_conn_hash_size;   /* size of kra_conns */
@@ -475,7 +487,8 @@ extern void kranal_update_reaper_timeout (long timeout);
 extern void kranal_tx_done (kra_tx_t *tx, int completion);
 extern void kranal_unlink_peer_locked (kra_peer_t *peer);
 extern void kranal_schedule_conn (kra_conn_t *conn);
-extern kra_peer_t *kranal_create_peer (ptl_nid_t nid);
+extern int kranal_create_peer (kra_peer_t **peerp, ptl_nid_t nid);
+extern int kranal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port);
 extern kra_peer_t *kranal_find_peer_locked (ptl_nid_t nid);
 extern void kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx);
 extern int kranal_del_peer (ptl_nid_t nid);
@@ -488,4 +501,5 @@ extern void kranal_close_conn_locked (kra_conn_t *conn, int error);
 extern void kranal_terminate_conn_locked (kra_conn_t *conn);
 extern void kranal_connect (kra_peer_t *peer);
 extern int kranal_conn_handshake (struct socket *sock, kra_peer_t *peer);
-extern void kranal_pause(int ticks);
+extern int kranal_tunables_init(void);
+extern void kranal_tunables_fini(void);
index 53dabb8..78993f7 100644 (file)
@@ -282,7 +282,7 @@ kranal_setup_rdma_buffer (kra_tx_t *tx, int niov,
         return kranal_setup_virt_buffer(tx, niov, iov, offset, nob);
 }
 
-void
+int
 kranal_map_buffer (kra_tx_t *tx)
 {
         kra_conn_t     *conn = tx->tx_conn;
@@ -299,23 +299,45 @@ kranal_map_buffer (kra_tx_t *tx)
         case RANAL_BUF_IMMEDIATE:
         case RANAL_BUF_PHYS_MAPPED:
         case RANAL_BUF_VIRT_MAPPED:
-                break;
+                return 0;
 
         case RANAL_BUF_PHYS_UNMAPPED:
                 rrc = RapkRegisterPhys(dev->rad_handle,
                                        tx->tx_phys, tx->tx_phys_npages,
                                        &tx->tx_map_key);
-                LASSERT (rrc == RAP_SUCCESS);
+                if (rrc != RAP_SUCCESS) {
+                        CERROR ("Can't map %d pages: dev %d "
+                                "phys %u pp %u, virt %u nob %lu\n",
+                                tx->tx_phys_npages, dev->rad_id, 
+                                dev->rad_nphysmap, dev->rad_nppphysmap,
+                                dev->rad_nvirtmap, dev->rad_nobvirtmap);
+                        return -ENOMEM; /* assume insufficient resources */
+                }
+
+                dev->rad_nphysmap++;
+                dev->rad_nppphysmap += tx->tx_phys_npages;
+
                 tx->tx_buftype = RANAL_BUF_PHYS_MAPPED;
-                break;
+                return 0;
 
         case RANAL_BUF_VIRT_UNMAPPED:
                 rrc = RapkRegisterMemory(dev->rad_handle,
                                          tx->tx_buffer, tx->tx_nob,
                                          &tx->tx_map_key);
-                LASSERT (rrc == RAP_SUCCESS);
+                if (rrc != RAP_SUCCESS) {
+                        CERROR ("Can't map %d bytes: dev %d "
+                                "phys %u pp %u, virt %u nob %lu\n",
+                                tx->tx_nob, dev->rad_id, 
+                                dev->rad_nphysmap, dev->rad_nppphysmap,
+                                dev->rad_nvirtmap, dev->rad_nobvirtmap);
+                        return -ENOMEM; /* assume insufficient resources */
+                }
+
+                dev->rad_nvirtmap++;
+                dev->rad_nobvirtmap += tx->tx_nob;
+
                 tx->tx_buftype = RANAL_BUF_VIRT_MAPPED;
-                break;
+                return 0;
         }
 }
 
@@ -342,6 +364,10 @@ kranal_unmap_buffer (kra_tx_t *tx)
                 rrc = RapkDeregisterMemory(dev->rad_handle, NULL,
                                            &tx->tx_map_key);
                 LASSERT (rrc == RAP_SUCCESS);
+
+                dev->rad_nphysmap--;
+                dev->rad_nppphysmap -= tx->tx_phys_npages;
+
                 tx->tx_buftype = RANAL_BUF_PHYS_UNMAPPED;
                 break;
 
@@ -352,6 +378,10 @@ kranal_unmap_buffer (kra_tx_t *tx)
                 rrc = RapkDeregisterMemory(dev->rad_handle, tx->tx_buffer,
                                            &tx->tx_map_key);
                 LASSERT (rrc == RAP_SUCCESS);
+
+                dev->rad_nvirtmap--;
+                dev->rad_nobvirtmap -= tx->tx_nob;
+
                 tx->tx_buftype = RANAL_BUF_VIRT_UNMAPPED;
                 break;
         }
@@ -428,6 +458,8 @@ kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid)
         kra_peer_t      *peer;
         kra_conn_t      *conn;
         unsigned long    now;
+        int              rc;
+        int              retry;
         rwlock_t        *g_lock = &kranal_data.kra_global_lock;
 
         /* If I get here, I've committed to send, so I complete the tx with
@@ -435,33 +467,46 @@ kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid)
 
         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
 
-        read_lock(g_lock);
+        for (retry = 0; ; retry = 1) {
 
-        peer = kranal_find_peer_locked(nid);
-        if (peer == NULL) {
-                read_unlock(g_lock);
-                kranal_tx_done(tx, -EHOSTUNREACH);
-                return;
-        }
+                read_lock(g_lock);
 
-        conn = kranal_find_conn_locked(peer);
-        if (conn != NULL) {
-                kranal_post_fma(conn, tx);
+                peer = kranal_find_peer_locked(nid);
+                if (peer != NULL) {
+                        conn = kranal_find_conn_locked(peer);
+                        if (conn != NULL) {
+                                kranal_post_fma(conn, tx);
+                                read_unlock(g_lock);
+                                return;
+                        }
+                }
+                
+                /* Making connections; I'll need a write lock... */
                 read_unlock(g_lock);
-                return;
-        }
+                write_lock_irqsave(g_lock, flags);
 
-        /* Making one or more connections; I'll need a write lock... */
-        read_unlock(g_lock);
-        write_lock_irqsave(g_lock, flags);
-
-        peer = kranal_find_peer_locked(nid);
-        if (peer == NULL) {
+                peer = kranal_find_peer_locked(nid);
+                if (peer != NULL)
+                        break;
+                
                 write_unlock_irqrestore(g_lock, flags);
-                kranal_tx_done(tx, -EHOSTUNREACH);
-                return;
-        }
+                
+                if (retry) {
+                        CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
+                        kranal_tx_done(tx, -EHOSTUNREACH);
+                        return;
+                }
 
+                rc = kranal_add_persistent_peer(nid, PTL_NIDADDR(nid),
+                                                *kranal_tunables.kra_port);
+                if (rc != 0) {
+                        CERROR("Can't add peer %s: %d\n",
+                               libcfs_nid2str(nid), rc);
+                        kranal_tx_done(tx, rc);
+                        return;
+                }
+        }
+        
         conn = kranal_find_conn_locked(peer);
         if (conn != NULL) {
                 /* Connection exists; queue message on it */
@@ -469,7 +514,7 @@ kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid)
                 write_unlock_irqrestore(g_lock, flags);
                 return;
         }
-
+                        
         LASSERT (peer->rap_persistence > 0);
 
         if (!peer->rap_connecting) {
@@ -641,7 +686,12 @@ kranal_do_send (ptl_ni_t        *ni,
                 tx->tx_conn = conn;
                 tx->tx_ptlmsg[0] = ptlmsg;
 
-                kranal_map_buffer(tx);
+                rc = kranal_map_buffer(tx);
+                if (rc != 0) {
+                        kranal_tx_done(tx, rc);
+                        return PTL_FAIL;
+                }
+
                 kranal_rdma(tx, RANAL_MSG_GET_DONE,
                             &conn->rac_rxmsg->ram_u.get.ragm_desc, nob,
                             conn->rac_rxmsg->ram_u.get.ragm_cookie);
@@ -662,7 +712,7 @@ kranal_do_send (ptl_ni_t        *ni,
 
                 if ((ptlmsg->msg_md->md_options & PTL_MD_KIOV) == 0 &&
                     ptlmsg->msg_md->md_length <= RANAL_FMA_MAX_DATA &&
-                    ptlmsg->msg_md->md_length <= kranal_tunables.kra_max_immediate)
+                    ptlmsg->msg_md->md_length <= *kranal_tunables.kra_max_immediate)
                         break;
 
                 tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_GET_REQ);
@@ -702,7 +752,7 @@ kranal_do_send (ptl_ni_t        *ni,
         case PTL_MSG_PUT:
                 if (kiov == NULL &&             /* not paged */
                     nob <= RANAL_FMA_MAX_DATA && /* small enough */
-                    nob <= kranal_tunables.kra_max_immediate)
+                    nob <= *kranal_tunables.kra_max_immediate)
                         break;                  /* send IMMEDIATE */
 
                 tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_PUT_REQ);
@@ -832,7 +882,11 @@ kranal_do_recv (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg,
                 }
 
                 tx->tx_conn = conn;
-                kranal_map_buffer(tx);
+                rc = kranal_map_buffer(tx);
+                if (rc != 0) {
+                        kranal_tx_done(tx, rc);
+                        return PTL_FAIL;
+                }
 
                 tx->tx_msg.ram_u.putack.rapam_src_cookie =
                         conn->rac_rxmsg->ram_u.putreq.raprm_cookie;
@@ -1404,7 +1458,7 @@ kranal_process_fmaq (kra_conn_t *conn)
         int           expect_reply;
 
         /* NB 1. kranal_sendmsg() may fail if I'm out of credits right now.
-         *       However I will be rescheduled some by an FMA completion event
+         *       However I will be rescheduled by an FMA completion event
          *       when I eventually get some.
          * NB 2. Sampling rac_state here races with setting it elsewhere.
          *       But it doesn't matter if I try to send a "real" message just
@@ -1484,7 +1538,6 @@ kranal_process_fmaq (kra_conn_t *conn)
         case RANAL_MSG_IMMEDIATE:
                 rc = kranal_sendmsg(conn, &tx->tx_msg,
                                     tx->tx_buffer, tx->tx_nob);
-                expect_reply = 0;
                 break;
 
         case RANAL_MSG_PUT_NAK:
@@ -1492,13 +1545,16 @@ kranal_process_fmaq (kra_conn_t *conn)
         case RANAL_MSG_GET_NAK:
         case RANAL_MSG_GET_DONE:
                 rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0);
-                expect_reply = 0;
                 break;
 
         case RANAL_MSG_PUT_REQ:
+                rc = kranal_map_buffer(tx);
+                LASSERT (rc != -EAGAIN);
+                if (rc != 0)
+                        break;
+
                 tx->tx_msg.ram_u.putreq.raprm_cookie = tx->tx_cookie;
                 rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0);
-                kranal_map_buffer(tx);
                 expect_reply = 1;
                 break;
 
@@ -1508,7 +1564,11 @@ kranal_process_fmaq (kra_conn_t *conn)
                 break;
 
         case RANAL_MSG_GET_REQ:
-                kranal_map_buffer(tx);
+                rc = kranal_map_buffer(tx);
+                LASSERT (rc != -EAGAIN);
+                if (rc != 0)
+                        break;
+
                 tx->tx_msg.ram_u.get.ragm_cookie = tx->tx_cookie;
                 tx->tx_msg.ram_u.get.ragm_desc.rard_key = tx->tx_map_key;
                 tx->tx_msg.ram_u.get.ragm_desc.rard_addr.AddressBits =
@@ -1529,10 +1589,8 @@ kranal_process_fmaq (kra_conn_t *conn)
                 return;
         }
 
-        LASSERT (rc == 0);
-
-        if (!expect_reply) {
-                kranal_tx_done(tx, 0);
+        if (!expect_reply || rc != 0) {
+                kranal_tx_done(tx, rc);
         } else {
                 /* LASSERT(current) above ensures this doesn't race with reply
                  * processing */
diff --git a/lnet/klnds/ralnd/ralnd_modparams.c b/lnet/klnds/ralnd/ralnd_modparams.c
new file mode 100644 (file)
index 0000000..d2c6fe9
--- /dev/null
@@ -0,0 +1,149 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "ranal.h"
+
+static int n_connd = RANAL_N_CONND;
+CFS_MODULE_PARM(n_connd, "i", int, 0444,
+                "# of connection daemons");
+
+static int min_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL;
+CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
+               "minimum connection retry interval (seconds)");
+
+static int max_reconnect_interval = RANAL_MAX_RECONNECT_INTERVAL;
+CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
+               "maximum connection retry interval (seconds)");
+
+static int ntx = RANAL_NTX;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of 'normal' transmit descriptors");
+
+static int ntx_nblk = RANAL_NTX_NBLK;
+CFS_MODULE_PARM(ntx_nblk, "i", int, 0444,
+               "# of 'reserved' transmit descriptors");
+
+static int fma_cq_size = RANAL_FMA_CQ_SIZE;
+CFS_MODULE_PARM(fma_cq_size, "i", int, 0444,
+               "size of the completion queue");
+
+static int timeout = RANAL_TIMEOUT;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "communications timeout (seconds)");
+
+static int listener_timeout = RANAL_LISTENER_TIMEOUT;
+CFS_MODULE_PARM(listener_timeout, "i", int, 0644,
+               "passive connection timeout");
+
+static int backlog = RANAL_BACKLOG;
+CFS_MODULE_PARM(backlog, "i", int, 0444,
+               "passive connection (listen) backlog");
+
+static int port = RANAL_PORT;
+CFS_MODULE_PARM(port, "i", int, 0444,
+               "connection request TCP/IP port");
+
+static int max_immediate = RANAL_MAX_IMMEDIATE;
+CFS_MODULE_PARM(max_immediate, "i", int, 0644,
+               "immediate/RDMA breakpoint");
+
+kra_tunables_t kranal_tunables = {
+       .kra_n_connd                = &n_connd,
+       .kra_min_reconnect_interval = &min_reconnect_interval,
+       .kra_max_reconnect_interval = &max_reconnect_interval,
+       .kra_ntx                    = &ntx,
+       .kra_ntx_nblk               = &ntx_nblk,
+       .kra_fma_cq_size            = &fma_cq_size,
+       .kra_timeout                = &timeout,
+       .kra_listener_timeout       = &listener_timeout,
+       .kra_backlog                = &backlog,
+       .kra_port                   = &port,
+       .kra_max_immediate          = &max_immediate,
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+static ctl_table kranal_ctl_table[] = {
+       {1, "n_connd", &n_connd, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {2, "min_reconnect_interval", &min_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {3, "max_reconnect_interval", &max_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {4, "ntx", &ntx, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {5, "ntx_nblk", &ntx_nblk, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {6, "fma_cq_size", &fma_cq_size, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {7, "timeout", &timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {8, "listener_timeout", &listener_timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {9, "backlog", &backlog, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {10, "port", &port, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {11, "max_immediate", &max_immediate, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {0}
+};
+
+static ctl_table kranal_top_ctl_table[] = {
+       {202, "ranal", NULL, 0, 0555, kranal_ctl_table},
+       {0}
+};
+
+int
+kranal_tunables_init ()
+{
+       kranal_tunables.kra_sysctl =
+               register_sysctl_table(kranal_top_ctl_table, 0);
+       
+       if (kranal_tunables.kra_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+       
+       return 0;
+}
+
+void
+kranal_tunables_fini ()
+{
+       if (kranal_tunables.kra_sysctl != NULL)
+               unregister_sysctl_table(kranal_tunables.kra_sysctl);
+}
+
+#else
+
+int
+kranal_tunables_init ()
+{
+       return 0;
+}
+
+void
+kranal_tunables_fini ()
+{
+}
+
+#endif
+
index f60c567..9655ad9 100644 (file)
@@ -94,17 +94,19 @@ ksocknal_destroy_route (ksock_route_t *route)
         PORTAL_FREE (route, sizeof (*route));
 }
 
-ksock_peer_t *
-ksocknal_create_peer (ptl_ni_t *ni, ptl_nid_t nid)
+int
+ksocknal_create_peer (ksock_peer_t **peerp, ptl_ni_t *ni, ptl_nid_t nid)
 {
-        ksock_net_t  *net = ni->ni_data;
-        ksock_peer_t *peer;
+        ksock_net_t   *net = ni->ni_data;
+        ksock_peer_t  *peer;
+        unsigned long  flags;
 
         LASSERT (nid != PTL_NID_ANY);
+        LASSERT (!in_interrupt());
 
         PORTAL_ALLOC (peer, sizeof (*peer));
         if (peer == NULL)
-                return (NULL);
+                return -ENOMEM;
 
         memset (peer, 0, sizeof (*peer));       /* NULL pointers/clear flags etc */
 
@@ -116,14 +118,29 @@ ksocknal_create_peer (ptl_ni_t *ni, ptl_nid_t nid)
         CFS_INIT_LIST_HEAD (&peer->ksnp_routes);
         CFS_INIT_LIST_HEAD (&peer->ksnp_tx_queue);
 
-        atomic_inc (&net->ksnn_npeers);
-        return (peer);
+        spin_lock_irqsave(&net->ksnn_lock, flags);
+
+        if (net->ksnn_shutdown) {
+                spin_unlock_irqrestore(&net->ksnn_lock, flags);
+                
+                PORTAL_FREE(peer, sizeof(*peer));
+                CERROR("Can't create peer: network shutdown\n");
+                return -ESHUTDOWN;
+        }
+
+        net->ksnn_npeers++;
+
+        spin_unlock_irqrestore(&net->ksnn_lock, flags);
+
+        *peerp = peer;
+        return 0;
 }
 
 void
 ksocknal_destroy_peer (ksock_peer_t *peer)
 {
-        ksock_net_t *net = peer->ksnp_ni->ni_data;
+        ksock_net_t    *net = peer->ksnp_ni->ni_data;
+        unsigned long   flags;
 
         CDEBUG (D_NET, "peer %s %p deleted\n", 
                 libcfs_nid2str(peer->ksnp_nid), peer);
@@ -139,7 +156,9 @@ ksocknal_destroy_peer (ksock_peer_t *peer)
          * until they are destroyed, so we can be assured that _all_ state to
          * do with this peer has been cleaned up when its refcount drops to
          * zero. */
-        atomic_dec (&net->ksnn_npeers);
+        spin_lock_irqsave(&net->ksnn_lock, flags);
+        net->ksnn_npeers--;
+        spin_unlock_irqrestore(&net->ksnn_lock, flags);
 }
 
 ksock_peer_t *
@@ -206,8 +225,8 @@ ksocknal_unlink_peer_locked (ksock_peer_t *peer)
 }
 
 int
-ksocknal_get_peer_info (int index, ptl_nid_t *nid,
-                        __u32 *myip, __u32 *peer_ip, int *port,
+ksocknal_get_peer_info (ptl_ni_t *ni, int index, 
+                        ptl_nid_t *nid, __u32 *myip, __u32 *peer_ip, int *port,
                         int *conn_count, int *share_count)
 {
         ksock_peer_t      *peer;
@@ -225,6 +244,9 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid,
                 list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
                         peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
 
+                        if (peer->ksnp_ni != ni)
+                                continue;
+
                         if (peer->ksnp_n_passive_ips == 0 &&
                             list_empty(&peer->ksnp_routes)) {
                                 if (index-- > 0)
@@ -414,14 +436,15 @@ ksocknal_add_peer (ptl_ni_t *ni, ptl_nid_t nid, __u32 ipaddr, int port)
         ksock_peer_t      *peer2;
         ksock_route_t     *route;
         ksock_route_t     *route2;
+        int                rc;
 
         if (nid == PTL_NID_ANY)
                 return (-EINVAL);
 
         /* Have a brand new peer ready... */
-        peer = ksocknal_create_peer(ni, nid);
-        if (peer == NULL)
-                return (-ENOMEM);
+        rc = ksocknal_create_peer(&peer, ni, nid);
+        if (rc != 0)
+                return rc;
 
         route = ksocknal_create_route (ipaddr, port);
         if (route == NULL) {
@@ -560,7 +583,7 @@ ksocknal_del_peer (ptl_ni_t *ni, ptl_nid_t nid, __u32 ip)
 }
 
 ksock_conn_t *
-ksocknal_get_conn_by_idx (int index)
+ksocknal_get_conn_by_idx (ptl_ni_t *ni, int index)
 {
         ksock_peer_t      *peer;
         struct list_head  *ptmp;
@@ -576,6 +599,9 @@ ksocknal_get_conn_by_idx (int index)
 
                         LASSERT (!peer->ksnp_closing);
 
+                        if (peer->ksnp_ni != ni)
+                                continue;
+
                         list_for_each (ctmp, &peer->ksnp_conns) {
                                 if (index-- > 0)
                                         continue;
@@ -889,18 +915,11 @@ ksocknal_create_routes(ksock_peer_t *peer, int port,
                 ksocknal_route_decref(newroute);
 }
 
-void
-ksocknal_pause(int ticks)
-{
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        schedule_timeout(ticks);
-}
-
 int
 ksocknal_listener (void *arg)
 {
         char                name[16];
-        ksock_connreq_t    *cr;
+        ksock_connreq_t    *cr = NULL;
         int                 rc;
         unsigned long       flags;
 
@@ -911,9 +930,9 @@ ksocknal_listener (void *arg)
         kportal_daemonize(name);
         kportal_blockallsigs();
 
-        rc = ksocknal_lib_listen(&ksocknal_data.ksnd_listener_sock, 
-                                 *ksocknal_tunables.ksnd_port,
-                                 *ksocknal_tunables.ksnd_backlog);
+        rc = libcfs_sock_listen(&ksocknal_data.ksnd_listener_sock,
+                                0, *ksocknal_tunables.ksnd_port,
+                                *ksocknal_tunables.ksnd_backlog);
 
         /* set init status and unblock parent */
         ksocknal_data.ksnd_listener_shutdown = rc;
@@ -923,25 +942,40 @@ ksocknal_listener (void *arg)
                 return rc;
 
         while (ksocknal_data.ksnd_listener_shutdown == 0) {
+
+                if (cr == NULL) {
+                        PORTAL_ALLOC(cr, sizeof(*cr));
+                        if (cr == NULL) {
+                                CWARN("ENOMEM: listener pausing\n");
+                                libcfs_pause(cfs_time_seconds(1));
+                                continue;
+                        }
+                }
                 
-                rc = ksocknal_lib_accept(ksocknal_data.ksnd_listener_sock, &cr);
+                rc = libcfs_sock_accept(&cr->ksncr_sock, 
+                                        ksocknal_data.ksnd_listener_sock,
+                                        0);
                 if (rc != 0) {
                         if (rc != -EAGAIN) {
-                                CWARN("Accept error: %d\n", rc);
-                                ksocknal_pause(cfs_time_seconds(1));
+                                CWARN("Accept error %d: listener pausing\n",
+                                      rc);
+                                libcfs_pause(cfs_time_seconds(1));
                         }
                         continue;
                 }
-                
+
                 spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags);
 
                 list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
                 wake_up(&ksocknal_data.ksnd_connd_waitq);
                         
                 spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock, flags);
+                cr = NULL;
         }
 
-        ksocknal_lib_release_sock(ksocknal_data.ksnd_listener_sock);
+        libcfs_sock_release(ksocknal_data.ksnd_listener_sock);
+        if (cr != NULL)
+                PORTAL_FREE(cr, sizeof(*cr));
 
         /* unblock executioner */
         mutex_up(&ksocknal_data.ksnd_listener_signal);
@@ -980,7 +1014,7 @@ ksocknal_stop_listener(void)
 
         /* make the listener exit its loop... */
         ksocknal_data.ksnd_listener_shutdown = 1;
-        ksocknal_lib_abort_accept(ksocknal_data.ksnd_listener_sock);
+        libcfs_sock_abort_accept(ksocknal_data.ksnd_listener_sock);
 
         /* block until listener exits */
         mutex_down(&ksocknal_data.ksnd_listener_signal);
@@ -993,6 +1027,8 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
         rwlock_t          *global_lock = &ksocknal_data.ksnd_global_lock;
         __u32              ipaddrs[PTL_MAX_INTERFACES];
         int                nipaddrs;
+        ptl_ni_t          *ni;
+        ksock_net_t       *net;
         ptl_nid_t          nid;
         struct list_head  *tmp;
         __u64              incarnation;
@@ -1043,8 +1079,8 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                 goto failed_1;
 
         if (route != NULL) {
-                ptl_ni_t     *ni = route->ksnr_peer->ksnp_ni;
-                ksock_net_t  *net = ni->ni_data;
+                ni = route->ksnr_peer->ksnp_ni;
+                net = (ksock_net_t *)ni->ni_data;
 
                 /* Active connection sends HELLO eagerly */
                 nipaddrs = ksocknal_local_ipvec(ni, ipaddrs);
@@ -1057,9 +1093,14 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
         }
 
         /* Find out/confirm peer's NID and connection type and get the
-         * vector of interfaces she's willing to let me connect to */
+         * vector of interfaces she's willing to let me connect to.
+         * Passive connections use the listener timeout since the peer sends
+         * eagerly */
         nid = (route == NULL) ? PTL_NID_ANY : route->ksnr_peer->ksnp_nid;
-        rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs);
+        rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs,
+                                  (route == NULL) ? 
+                                  *ksocknal_tunables.ksnd_listen_timeout :
+                                  *ksocknal_tunables.ksnd_timeout);
         if (rc < 0)
                 goto failed_1;
         nipaddrs = rc;
@@ -1069,7 +1110,7 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                 peer = route->ksnr_peer;
                 ksocknal_peer_addref(peer);
         } else {
-                ptl_ni_t *ni = ptl_net2ni(PTL_NIDNET(nid));
+                ni = ptl_net2ni(PTL_NIDNET(nid));
 
                 if (ni == NULL) {
                         CERROR("Refusing connection attempt "
@@ -1077,8 +1118,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                         rc = -ECONNREFUSED;
                         goto failed_1;
                 }
-                
-                peer = ksocknal_create_peer(ni, nid);
+
+                net = (ksock_net_t *)ni->ni_data;
+                rc = ksocknal_create_peer(&peer, ni, nid);
 
                 /* lose extra ref from ptl_net2ni NB we wait for all the peers
                  * to be deleted before ni teardown can complete; i.e. ni can't
@@ -1086,10 +1128,8 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                  * there's no to account the peer's refs on ni. */
                 ptl_ni_decref(ni);
 
-                if (peer == NULL) {
-                        rc = -ENOMEM;
+                if (rc != 0)
                         goto failed_1;
-                }
 
                 write_lock_irqsave(global_lock, flags);
 
@@ -1251,7 +1291,7 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
         PORTAL_FREE (conn, sizeof(*conn));
 
  failed_0:
-        ksocknal_lib_release_sock(sock);
+        libcfs_sock_release(sock);
 
         LASSERT (rc != 0);
         return (rc);
@@ -1850,9 +1890,12 @@ ksocknal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg)
                 int          conn_count = 0;
                 int          share_count = 0;
 
-                rc = ksocknal_get_peer_info(data->ioc_count, &nid,
-                                            &myip, &ip, &port,
+                rc = ksocknal_get_peer_info(ni, data->ioc_count,
+                                            &nid, &myip, &ip, &port,
                                             &conn_count,  &share_count);
+                if (rc != 0)
+                        return rc;
+                        
                 data->ioc_nid    = nid;
                 data->ioc_count  = share_count;
                 data->ioc_u32[0] = ip;
@@ -1877,7 +1920,7 @@ ksocknal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg)
                 int           txmem;
                 int           rxmem;
                 int           nagle;
-                ksock_conn_t *conn = ksocknal_get_conn_by_idx (data->ioc_count);
+                ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count);
 
                 if (conn == NULL)
                         return -ENOENT;
@@ -1903,15 +1946,14 @@ ksocknal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg)
                                                       data->ioc_u32[0]);
 
         case IOC_PORTAL_REGISTER_MYNID:
+                /* Ignore if this is a noop */
                if (data->ioc_nid == ni->ni_nid)
                        return 0;
 
-               LASSERT (PTL_NIDNET(data->ioc_nid) == PTL_NIDNET(ni->ni_nid));
-
-               CERROR("obsolete IOC_PORTAL_REGISTER_MYNID for %s(%s)\n",
+               CERROR("obsolete IOC_PORTAL_REGISTER_MYNID: %s(%s)\n",
                       libcfs_nid2str(data->ioc_nid),
                       libcfs_nid2str(ni->ni_nid));
-               return 0;
+               return -EINVAL;
 
         case IOC_PORTAL_PUSH_CONNECTION:
                 return ksocknal_push (ni, data->ioc_nid);
@@ -1968,6 +2010,7 @@ ksocknal_base_shutdown (void)
 {
         ksock_sched_t *sched;
         int            i;
+        unsigned long  flags;
 
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
                atomic_read (&portal_kmemory));
@@ -1981,12 +2024,19 @@ ksocknal_base_shutdown (void)
                 ksocknal_stop_listener();
                 /* Wait for queued connreqs to clean up */
                 i = 2;
+                spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags);
                 while (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+                        spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock,
+                                               flags);
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "waiting for connreqs to clean up\n");
-                        ksocknal_pause(cfs_time_seconds(1));
+                        libcfs_pause(cfs_time_seconds(1));
+
+                        spin_lock_irqsave(&ksocknal_data.ksnd_connd_lock, flags);
                 }
+                spin_unlock_irqrestore(&ksocknal_data.ksnd_connd_lock, 
+                                       flags);
 
                 /* fall through */
 
@@ -2031,7 +2081,7 @@ ksocknal_base_shutdown (void)
                                "waiting for %d threads to terminate\n",
                                 ksocknal_data.ksnd_nthreads);
                         read_unlock(&ksocknal_data.ksnd_global_lock);
-                        ksocknal_pause(cfs_time_seconds(1));
+                        libcfs_pause(cfs_time_seconds(1));
                         read_lock(&ksocknal_data.ksnd_global_lock);
                 }
                 read_unlock(&ksocknal_data.ksnd_global_lock);
@@ -2220,22 +2270,33 @@ ksocknal_shutdown (ptl_ni_t *ni)
 {
         ksock_net_t    *net = ni->ni_data;
         int             i;
+        unsigned long   flags;
 
         LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
         LASSERT(ksocknal_data.ksnd_nnets > 0);
 
+        spin_lock_irqsave(&net->ksnn_lock, flags);
+        net->ksnn_shutdown = 1;                 /* prevent new peers */
+        spin_unlock_irqrestore(&net->ksnn_lock, flags);
+
         /* Delete all peers */
         ksocknal_del_peer(ni, PTL_NID_ANY, 0);
 
         /* Wait for all peer state to clean up */
         i = 2;
-        while (atomic_read (&net->ksnn_npeers) != 0) {
+        spin_lock_irqsave(&net->ksnn_lock, flags);
+        while (net->ksnn_npeers != 0) {
+                spin_unlock_irqrestore(&net->ksnn_lock, flags);
+
                 i++;
                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                        "waiting for %d peers to disconnect\n",
-                       atomic_read (&net->ksnn_npeers));
-                ksocknal_pause(cfs_time_seconds(1));
+                       net->ksnn_npeers);
+                libcfs_pause(cfs_time_seconds(1));
+
+                spin_lock_irqsave(&net->ksnn_lock, flags);
         }
+        spin_unlock_irqrestore(&net->ksnn_lock, flags);
 
         for (i = 0; i < net->ksnn_ninterfaces; i++) {
                 LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0);
@@ -2249,6 +2310,61 @@ ksocknal_shutdown (ptl_ni_t *ni)
                 ksocknal_base_shutdown();
 }
 
+int
+ksocknal_enumerate_interfaces(ksock_net_t *net)
+{
+        char      **names;
+        int         i;
+        int         j;
+        int         rc;
+        int         n;
+                
+        n = libcfs_ipif_enumerate(&names);
+        if (n <= 0) {
+                CERROR("Can't enumerate interfaces: %d\n", n);
+                return n;
+        }
+
+        for (i = j = 0; i < n; i++) {
+                int        up;
+                __u32      ip;
+                __u32      mask;
+                
+                if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+                        continue;
+                
+                rc = libcfs_ipif_query(names[i], &up, &ip, &mask);
+                if (rc != 0) {
+                        CWARN("Can't get interface %s info: %d\n",
+                              names[i], rc);
+                        continue;
+                }
+                
+                if (!up) {
+                        CWARN("Ignoring interface %s (down)\n",
+                              names[i]);
+                        continue;
+                }
+
+                if (j == PTL_MAX_INTERFACES) {
+                        CWARN("Ignoring interface %s (too many interfaces)\n",
+                              names[i]);
+                        continue;
+                }
+
+                net->ksnn_interfaces[j].ksni_ipaddr = ip;
+                net->ksnn_interfaces[j].ksni_netmask = mask;
+                j++;
+        }
+
+        libcfs_ipif_free_enumeration(names, n);
+        
+        if (j == 0)
+                CERROR("Can't find any usable interfaces\n");
+        
+        return j;
+}
+
 ptl_err_t
 ksocknal_startup (ptl_ni_t *ni)
 {
@@ -2269,31 +2385,39 @@ ksocknal_startup (ptl_ni_t *ni)
                 goto fail_0;
                 
         memset(net, 0, sizeof(*net));
-        
+        spin_lock_init(&net->ksnn_lock);
         net->ksnn_incarnation = ksocknal_new_incarnation();
         ni->ni_data = net;
 
         if (ni->ni_interfaces[0] == NULL) {
-                rc = ksocknal_lib_enumerate_ifs(net->ksnn_interfaces,
-                                                PTL_MAX_INTERFACES);
-                if (rc < 0)
+                rc = ksocknal_enumerate_interfaces(net);
+                if (rc <= 0)
                         goto fail_1;
 
-                if (rc == 0) {
-                        CERROR("Can't find any interfaces\n");
-                        goto fail_1;
-                }
-                
                 net->ksnn_ninterfaces = rc;
         } else {
                 for (i = 0; i < PTL_MAX_INTERFACES; i++) {
+                        int    up;
+
                         if (ni->ni_interfaces[i] == NULL)
                                 break;
 
-                        rc = ksocknal_lib_init_if(&net->ksnn_interfaces[i],
-                                                  ni->ni_interfaces[i]);
-                        if (rc != 0)
+                        rc = libcfs_ipif_query(
+                                ni->ni_interfaces[i], &up,
+                                &net->ksnn_interfaces[i].ksni_ipaddr,
+                                &net->ksnn_interfaces[i].ksni_netmask);
+                        
+                        if (rc != 0) {
+                                CERROR("Can't get interface %s info: %d\n",
+                                       ni->ni_interfaces[i], rc);
                                 goto fail_1;
+                        }
+                        
+                        if (!up) {
+                                CERROR("Interface %s is down\n",
+                                       ni->ni_interfaces[i]);
+                                goto fail_1;
+                        }
                 }
                 net->ksnn_ninterfaces = i;
         }
index 0014402..7245899 100644 (file)
 # define SOCKNAL_RISK_KMAP_DEADLOCK  1
 #endif
 
+/* minimum socket buffer required for connection handshake */
+#define SOCKNAL_MIN_BUFFER   (2*(sizeof(ptl_hdr_t) +                    \
+                                 PTL_MAX_INTERFACES * sizeof(__u32)))
+
 typedef struct                                  /* pool of forwarding buffers */
 {
         spinlock_t        fmp_lock;             /* serialise */
@@ -158,7 +162,9 @@ typedef struct
 typedef struct
 {
         __u64             ksnn_incarnation;     /* my epoch */
-        atomic_t          ksnn_npeers;          /* # peers */
+        spinlock_t        ksnn_lock;            /* serialise */
+        int               ksnn_npeers;          /* # peers */
+        int               ksnn_shutdown;        /* shutting down? */
         int               ksnn_ninterfaces;     /* IP interfaces */
         ksock_interface_t ksnn_interfaces[PTL_MAX_INTERFACES];
 } ksock_net_t;
@@ -434,15 +440,13 @@ ksocknal_connsock_addref (ksock_conn_t *conn)
         return (rc);
 }
 
-extern void ksocknal_lib_release_sock(struct socket *sock);
-
 static inline void
 ksocknal_connsock_decref (ksock_conn_t *conn)
 {
         LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0);
         if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) {
                 LASSERT (conn->ksnc_closing);
-                ksocknal_lib_release_sock(conn->ksnc_sock);
+                libcfs_sock_release(conn->ksnc_sock);
                 conn->ksnc_sock = NULL;
         }
 }
@@ -531,7 +535,8 @@ extern int ksocknal_reaper (void *arg);
 extern int ksocknal_send_hello (ksock_conn_t *conn, ptl_nid_t nid,
                                 __u64 incarnation, __u32 *ipaddrs, int nipaddrs);
 extern int ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, 
-                                __u64 *incarnation, __u32 *ipaddrs);
+                                __u64 *incarnation, __u32 *ipaddrs,
+                                int timeout);
 
 extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn);
 extern void ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn);
@@ -547,21 +552,8 @@ extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx);
 extern void ksocknal_lib_eager_ack (ksock_conn_t *conn);
 extern int ksocknal_lib_recv_iov (ksock_conn_t *conn);
 extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn);
-extern int ksocknal_lib_sock_write (struct socket *sock, 
-                                    void *buffer, int nob);
-extern int ksocknal_lib_sock_read (struct socket *sock, 
-                                   void *buffer, int nob);
 extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, 
                                            int *rxmem, int *nagle);
-extern int ksocknal_lib_connect_sock(struct socket **sockp, int *fatal,
-                                     ksock_route_t *route, int local_port);
-extern int ksocknal_lib_listen(struct socket **sockp, int port, int backlog);
-extern int ksocknal_lib_accept(struct socket *sock, ksock_connreq_t **crp);
-extern void ksocknal_lib_abort_accept(struct socket *sock);
 
 extern int ksocknal_lib_tunables_init(void);
 extern void ksocknal_lib_tunables_fini(void);
-
-extern int ksocknal_lib_init_if (ksock_interface_t *iface, char *name);
-extern int ksocknal_lib_enumerate_ifs (ksock_interface_t *ifs, int nifs);
-
index 25af875..f6bcac0 100644 (file)
@@ -689,7 +689,7 @@ ksocknal_launch_packet (ptl_ni_t *ni, ksock_tx_t *tx, ptl_nid_t nid)
         ksock_conn_t     *conn;
         ksock_route_t    *route;
         rwlock_t         *g_lock;
-        int               create_peer = 0;
+        int               retry;
         int               rc;
         
         /* Ensure the frags we've been given EXACTLY match the number of
@@ -713,50 +713,47 @@ ksocknal_launch_packet (ptl_ni_t *ni, ksock_tx_t *tx, ptl_nid_t nid)
 
         g_lock = &ksocknal_data.ksnd_global_lock;
         
- again:
-        if (create_peer) {
-                rc = ksocknal_add_peer(ni, nid, PTL_NIDADDR(nid),
-                                       *ksocknal_tunables.ksnd_port);
-                if (rc != 0) {
-                        CERROR("Can't add peer %s: %d\n",
-                               libcfs_nid2str(nid), rc);
-                        return rc;
-                }
-        }
-        
+        for (retry = 0;; retry) {
 #if !SOCKNAL_ROUND_ROBIN
-        read_lock (g_lock);
-        peer = ksocknal_find_peer_locked(ni, nid);
-        if (peer != NULL) {
-                if (ksocknal_find_connectable_route_locked(peer) == NULL) {
-                        conn = ksocknal_find_conn_locked (tx, peer);
-                        if (conn != NULL) {
-                                /* I've got no routes that need to be
-                                 * connecting and I do have an actual
-                                 * connection... */
-                                ksocknal_queue_tx_locked (tx, conn);
-                                read_unlock (g_lock);
-                                return (0);
+                read_lock (g_lock);
+                peer = ksocknal_find_peer_locked(ni, nid);
+                if (peer != NULL) {
+                        if (ksocknal_find_connectable_route_locked(peer) == NULL) {
+                                conn = ksocknal_find_conn_locked (tx, peer);
+                                if (conn != NULL) {
+                                        /* I've got no routes that need to be
+                                         * connecting and I do have an actual
+                                         * connection... */
+                                        ksocknal_queue_tx_locked (tx, conn);
+                                        read_unlock (g_lock);
+                                        return (0);
+                                }
                         }
                 }
-        }
  
-        /* I'll need a write lock... */
-        read_unlock (g_lock);
+                /* I'll need a write lock... */
+                read_unlock (g_lock);
 #endif
-        write_lock_irqsave(g_lock, flags);
+                write_lock_irqsave(g_lock, flags);
 
-        peer = ksocknal_find_peer_locked(ni, nid);
-        if (peer == NULL) {
+                peer = ksocknal_find_peer_locked(ni, nid);
+                if (peer != NULL) 
+                        break;
+                
                 write_unlock_irqrestore(g_lock, flags);
 
-                if (create_peer) {
+                if (retry) {
                         CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
-                        return -ENOENT;
+                        return -EHOSTUNREACH;
                 }
                 
-                create_peer = 1;
-                goto again;
+                rc = ksocknal_add_peer(ni, nid, PTL_NIDADDR(nid),
+                                       *ksocknal_tunables.ksnd_port);
+                if (rc != 0) {
+                        CERROR("Can't add peer %s: %d\n",
+                               libcfs_nid2str(nid), rc);
+                        return rc;
+                }
         }
 
         for (;;) {
@@ -1710,8 +1707,9 @@ ksocknal_send_hello (ksock_conn_t *conn,
                 ipaddrs[i] = __cpu_to_le32 (ipaddrs[i]);
         }
 
-        /* Receiver should be eager */
-        rc = ksocknal_lib_sock_write (sock, &hdr, sizeof(hdr));
+        /* socket buffer should have been set large enough not to block
+         * (timeout == 0) */
+        rc = libcfs_sock_write(sock, &hdr, sizeof(hdr), 0);
         if (rc != 0) {
                 CERROR ("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
                         rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
@@ -1721,7 +1719,7 @@ ksocknal_send_hello (ksock_conn_t *conn,
         if (nipaddrs == 0)
                 return (0);
         
-        rc = ksocknal_lib_sock_write (sock, ipaddrs, nipaddrs * sizeof(*ipaddrs));
+        rc = libcfs_sock_write(sock, ipaddrs, nipaddrs * sizeof(*ipaddrs), 0);
         if (rc != 0)
                 CERROR ("Error %d sending HELLO payload (%d)"
                         " to %u.%u.%u.%u/%d\n", rc, nipaddrs, 
@@ -1748,7 +1746,8 @@ ksocknal_invert_type(int type)
 
 int
 ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
-                     __u64 *incarnation, __u32 *ipaddrs)
+                     __u64 *incarnation, __u32 *ipaddrs,
+                     int timeout)
 {
         struct socket      *sock = conn->ksnc_sock;
         int                 rc;
@@ -1761,7 +1760,7 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
         hmv = (ptl_magicversion_t *)&hdr.dest_nid;
         LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
 
-        rc = ksocknal_lib_sock_read (sock, hmv, sizeof (*hmv));
+        rc = libcfs_sock_read(sock, hmv, sizeof (*hmv), timeout);
         if (rc != 0) {
                 CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
                         rc, HIPQUAD(conn->ksnc_ipaddr));
@@ -1794,7 +1793,8 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
          * header, followed by payload full of interface IP addresses.
          * Read the rest of it in now... */
 
-        rc = ksocknal_lib_sock_read (sock, hmv + 1, sizeof (hdr) - sizeof (*hmv));
+        rc = libcfs_sock_read(sock, hmv + 1, sizeof (hdr) - sizeof (*hmv), 
+                              timeout);
         if (rc != 0) {
                 CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n",
                         rc, HIPQUAD(conn->ksnc_ipaddr));
@@ -1867,7 +1867,7 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
         if (nips == 0)
                 return (0);
         
-        rc = ksocknal_lib_sock_read (sock, ipaddrs, nips * sizeof(*ipaddrs));
+        rc = libcfs_sock_read(sock, ipaddrs, nips * sizeof(*ipaddrs), timeout);
         if (rc != 0) {
                 CERROR ("Error %d reading IPs from %s ip %u.%u.%u.%u\n",
                         rc, libcfs_nid2str(*nid), HIPQUAD(conn->ksnc_ipaddr));
@@ -1895,15 +1895,12 @@ ksocknal_connect_peer (ksock_route_t *route, int type)
         int                 port;
         int                 fatal;
         
-        /* Iterate through reserved ports.  When typed connections are
-         * used, we will need to bind to multiple ports, but we only know
-         * this at connect time.  But, by that time we've already called
-         * bind() so we need a new socket. */
-
         for (port = 1023; port > 512; --port) {
+                /* Iterate through reserved ports. */
 
-                rc = ksocknal_lib_connect_sock(&sock, &fatal, route, port);
-
+                rc = libcfs_sock_connect(&sock, &fatal, 0,
+                                         route->ksnr_myipaddr, port,
+                                         route->ksnr_ipaddr, route->ksnr_port);
                 if (rc == 0) {
                         rc = ksocknal_create_conn(route, sock, type);
                         return rc;
index 7dc2287..d82e4b8 100644 (file)
@@ -487,84 +487,6 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
 }
 
 int
-ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
-
-        while (nob > 0) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
-
-                set_fs (KERNEL_DS);
-                rc = sock_sendmsg (sock, &msg, iov.iov_len);
-                set_fs (oldmm);
-
-                if (rc < 0)
-                        return (rc);
-
-                if (rc == 0) {
-                        CERROR ("Unexpected zero rc\n");
-                        return (-ECONNABORTED);
-                }
-
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
-        }
-
-        return (0);
-}
-
-int
-ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
-
-        while (nob > 0) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
-
-                set_fs (KERNEL_DS);
-                rc = sock_recvmsg (sock, &msg, iov.iov_len, 0);
-                set_fs (oldmm);
-
-                if (rc < 0)
-                        return (rc);
-
-                if (rc == 0)
-                        return (-ECONNABORTED);
-
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
-        }
-
-        return (0);
-}
-
-int
 ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
 {
         mm_segment_t   oldmm = get_fs ();
@@ -579,23 +501,15 @@ ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int
                 return (-ESHUTDOWN);
         }
 
-        set_fs (KERNEL_DS);
-
-        len = sizeof(*txmem);
-        rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-                             (char *)txmem, &len);
-        if (rc == 0) {
-                len = sizeof(*rxmem);
-                rc = sock_getsockopt(sock, SOL_SOCKET, SO_RCVBUF,
-                                     (char *)rxmem, &len);
-        }
+       rc = libcfs_sock_getbuf(sock, txmem, rxmem);
         if (rc == 0) {
                 len = sizeof(*nagle);
+               set_fs(KERNEL_DS);
                 rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
                                            (char *)nagle, &len);
+               set_fs(oldmm);
         }
 
-        set_fs (oldmm);
         ksocknal_connsock_decref(conn);
 
         if (rc == 0)
@@ -607,6 +521,20 @@ ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int
 }
 
 int
+ksocknal_lib_buffersize (int current_sz, int tunable_sz)
+{
+       /* ensure >= SOCKNAL_MIN_BUFFER */
+       if (current_sz < SOCKNAL_MIN_BUFFER)
+               return MAX(SOCKNAL_MIN_BUFFER, tunable_sz);
+
+       if (tunable_sz > SOCKNAL_MIN_BUFFER)
+               return tunable_sz;
+       
+       /* leave alone */
+       return 0;
+}
+
+int
 ksocknal_lib_setup_sock (struct socket *sock)
 {
         mm_segment_t    oldmm = get_fs ();
@@ -616,6 +544,8 @@ ksocknal_lib_setup_sock (struct socket *sock)
         int             keep_intvl;
         int             keep_count;
         int             do_keepalive;
+       int             sndbuf;
+       int             rcvbuf;
         struct linger   linger;
 
         sock->sk->sk_allocation = GFP_NOFS;
@@ -658,29 +588,23 @@ ksocknal_lib_setup_sock (struct socket *sock)
                 }
         }
 
-        if (ksocknal_tunables.ksnd_buffer_size > 0) {
-                option = *ksocknal_tunables.ksnd_buffer_size;
+       rc = libcfs_sock_getbuf(sock, &sndbuf, &rcvbuf);
+       if (rc != 0) {
+               CERROR("Can't get buffer sizes: %d\n", rc);
+               return rc;
+       }
 
-                set_fs (KERNEL_DS);
-                rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF,
-                                      (char *)&option, sizeof (option));
-                set_fs (oldmm);
-                if (rc != 0) {
-                        CERROR ("Can't set send buffer %d: %d\n",
-                                option, rc);
-                        return (rc);
-                }
+       sndbuf = ksocknal_lib_buffersize(sndbuf,
+                                        *ksocknal_tunables.ksnd_buffer_size);
+       rcvbuf = ksocknal_lib_buffersize(rcvbuf,
+                                        *ksocknal_tunables.ksnd_buffer_size);
 
-                set_fs (KERNEL_DS);
-                rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF,
-                                      (char *)&option, sizeof (option));
-                set_fs (oldmm);
-                if (rc != 0) {
-                        CERROR ("Can't set receive buffer %d: %d\n",
-                                option, rc);
-                        return (rc);
-                }
-        }
+       rc = libcfs_sock_setbuf(sock, sndbuf, rcvbuf);
+       if (rc != 0) {
+               CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+                       sndbuf, rcvbuf, rc);
+               return (rc);
+       }
 
         /* snapshot tunables */
         keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
@@ -732,409 +656,6 @@ ksocknal_lib_setup_sock (struct socket *sock)
         return (0);
 }
 
-int
-ksocknal_lib_set_sock_timeout (struct socket *sock, int timeout)
-{
-       struct timeval tv;
-       int            rc;
-        mm_segment_t   oldmm = get_fs();
-
-        /* Set the socket timeouts, so our connection handshake completes in
-         * finite time */
-        tv.tv_sec = timeout;
-        tv.tv_usec = 0;
-
-        set_fs (KERNEL_DS);
-        rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO,
-                              (char *)&tv, sizeof (tv));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set send timeout %d: %d\n", timeout, rc);
-               return rc;
-        }
-
-        set_fs (KERNEL_DS);
-        rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO,
-                              (char *)&tv, sizeof (tv));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set receive timeout %d: %d\n", timeout, rc);
-               return rc;
-        }
-
-       return 0;
-}
-
-void
-ksocknal_lib_release_sock(struct socket *sock)
-{
-       sock_release(sock);
-}
-
-int
-ksocknal_lib_create_sock(struct socket **sockp, int *fatal,
-                        int timeout, __u32 local_ip, int local_port)
-{
-        struct sockaddr_in  locaddr;
-        struct socket      *sock;
-        int                 rc;
-        int                 option;
-        mm_segment_t        oldmm = get_fs();
-
-       *fatal = 1;                             /* assume errors are fatal */
-
-        memset(&locaddr, 0, sizeof(locaddr));
-        locaddr.sin_family = AF_INET;
-        locaddr.sin_port = htons(local_port);
-        locaddr.sin_addr.s_addr = (local_ip == 0) ? 
-                                 INADDR_ANY : htonl(local_ip);
-
-        rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
-        *sockp = sock;
-        if (rc != 0) {
-                CERROR ("Can't create socket: %d\n", rc);
-                return (rc);
-        }
-
-       rc = ksocknal_lib_set_sock_timeout(sock, timeout);
-       if (rc != 0)
-               goto failed;
-
-        set_fs (KERNEL_DS);
-        option = 1;
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-                             (char *)&option, sizeof (option));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
-                goto failed;
-        }
-
-        rc = sock->ops->bind(sock,
-                             (struct sockaddr *)&locaddr, sizeof(locaddr));
-        if (rc == -EADDRINUSE) {
-                CDEBUG(D_WARNING, "Port %d already in use\n", local_port);
-                *fatal = 0;
-                goto failed;
-        }
-        if (rc != 0) {
-                CERROR("Error trying to bind to reserved port %d: %d\n",
-                       local_port, rc);
-                goto failed;
-        }
-
-       return 0;
-
- failed:
-        sock_release(sock);
-        return rc;
-}
-
-int
-ksocknal_lib_listen(struct socket **sockp, int port, int backlog)
-{
-       int      fatal;
-       int      rc;
-
-       rc = ksocknal_lib_create_sock(sockp, &fatal, 1, 0, port);
-       if (rc != 0)
-               return rc;
-
-       rc = (*sockp)->ops->listen(*sockp, backlog);
-       if (rc == 0)
-               return 0;
-       
-       CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
-       sock_release(*sockp);
-       return rc;
-}
-
-int
-ksocknal_lib_accept(struct socket *sock, ksock_connreq_t **crp)
-{
-       wait_queue_t   wait;
-       struct socket *newsock;
-       int            rc;
-
-       init_waitqueue_entry(&wait, current);
-
-       newsock = sock_alloc();
-       if (newsock == NULL) {
-               CERROR("Can't allocate socket\n");
-               return -ENOMEM;
-       }
-
-       /* XXX this should add a ref to sock->ops->owner, if
-        * TCP could be a module */
-       newsock->type = sock->type;
-       newsock->ops = sock->ops;
-
-       set_current_state(TASK_INTERRUPTIBLE);
-       add_wait_queue(sock->sk->sk_sleep, &wait);
-       
-       rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
-       if (rc == -EAGAIN) {
-               /* Nothing ready, so wait for activity */
-               schedule();
-               rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
-       }
-       
-       remove_wait_queue(sock->sk->sk_sleep, &wait);
-       set_current_state(TASK_RUNNING);
-
-       if (rc != 0)
-               goto failed;
-       
-       rc = ksocknal_lib_set_sock_timeout(newsock, 
-                                          *ksocknal_tunables.ksnd_listen_timeout);
-       if (rc != 0)
-               goto failed;
-       
-       rc = -ENOMEM;
-       PORTAL_ALLOC(*crp, sizeof(**crp));
-       if (*crp == NULL)
-               goto failed;
-
-       (*crp)->ksncr_sock = newsock;
-       return 0;
-
- failed:
-       sock_release(newsock);
-       return rc;
-}
-
-void
-ksocknal_lib_abort_accept(struct socket *sock)
-{
-       wake_up_all(sock->sk->sk_sleep);
-}
-
-int
-ksocknal_lib_connect_sock(struct socket **sockp, int *fatal,
-                         ksock_route_t *route, int local_port)
-{
-        struct sockaddr_in  srvaddr;
-        int                 rc;
-
-        memset (&srvaddr, 0, sizeof (srvaddr));
-        srvaddr.sin_family = AF_INET;
-        srvaddr.sin_port = htons (route->ksnr_port);
-        srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
-
-       rc = ksocknal_lib_create_sock(sockp, fatal,
-                                     *ksocknal_tunables.ksnd_timeout,
-                                     route->ksnr_myipaddr, local_port);
-       if (rc != 0)
-               return rc;
-
-        rc = (*sockp)->ops->connect(*sockp,
-                                   (struct sockaddr *)&srvaddr, sizeof(srvaddr),
-                                   0);
-        if (rc == 0)
-                return 0;
-
-        /* EADDRNOTAVAIL probably means we're already connected to the same
-         * peer/port on the same local port on a differently typed
-         * connection.  Let our caller retry with a different local
-         * port... */
-        *fatal = !(rc == -EADDRNOTAVAIL);
-
-        CDEBUG(*fatal ? D_ERROR : D_NET,
-               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
-               HIPQUAD(route->ksnr_myipaddr), local_port,
-               HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
-
-       sock_release(*sockp);
-        return rc;
-}
-
-int
-ksocknal_lib_getifaddr(struct socket *sock, char *name, __u32 *ip, __u32 *mask)
-{
-       mm_segment_t   oldmm = get_fs();
-       struct ifreq   ifr;
-       int            rc;
-       __u32          val;
-
-       LASSERT (strnlen(name, IFNAMSIZ) < IFNAMSIZ);
-       
-       strcpy(ifr.ifr_name, name);
-       ifr.ifr_addr.sa_family = AF_INET;
-       set_fs(KERNEL_DS);
-       rc = sock->ops->ioctl(sock, SIOCGIFADDR, (unsigned long)&ifr);
-       set_fs(oldmm);
-
-       if (rc != 0)
-               return rc;
-
-       val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
-       *ip = ntohl(val);
-
-       strcpy(ifr.ifr_name, name);
-       ifr.ifr_addr.sa_family = AF_INET;
-       set_fs(KERNEL_DS);
-       rc = sock->ops->ioctl(sock, SIOCGIFNETMASK, (unsigned long)&ifr);
-       set_fs(oldmm);
-       
-       if (rc != 0)
-               return rc;
-       
-       val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr;
-       *mask = ntohl(val);
-       return 0;
-}
-
-int
-ksocknal_lib_init_if (ksock_interface_t *iface, char *name)
-{
-       struct socket  *sock;
-       int             rc;
-       int             nob;
-
-       rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
-       if (rc != 0) {
-               CERROR ("Can't create socket: %d\n", rc);
-               return rc;
-       }
-       
-       nob = strnlen(name, IFNAMSIZ);
-       if (nob == IFNAMSIZ) {
-               CERROR("Interface name %s too long\n", name);
-               rc -EINVAL;
-       } else {
-               CLASSERT (sizeof(iface->ksni_name) >= IFNAMSIZ);
-               strcpy(iface->ksni_name, name);
-
-               rc = ksocknal_lib_getifaddr(sock, name,
-                                           &iface->ksni_ipaddr,
-                                           &iface->ksni_netmask);
-               if (rc != 0)
-                       CERROR("Can't get IP address for interface %s\n", name);
-       }
-
-       sock_release(sock);
-       return rc;
-}
-
-int
-ksocknal_lib_enumerate_ifs (ksock_interface_t *ifs, int nifs)
-{
-       int             nalloc = PTL_MAX_INTERFACES;
-       char            name[IFNAMSIZ];
-       int             nfound;
-       int             nused;
-       struct socket  *sock;
-       struct ifconf   ifc;
-       struct ifreq   *ifr;
-       mm_segment_t    oldmm = get_fs();
-       __u32           ipaddr;
-       __u32           netmask;
-       int             rc;
-       int             i;
-
-       rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
-       if (rc != 0) {
-               CERROR ("Can't create socket: %d\n", rc);
-               return rc;
-       }
-
-       for (;;) {
-               PORTAL_ALLOC(ifr, nalloc * sizeof(*ifr));
-               if (ifr == NULL) {
-                       CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
-                       rc = -ENOMEM;
-                       goto out0;
-               }
-               
-               ifc.ifc_buf = (char *)ifr;
-               ifc.ifc_len = nalloc * sizeof(*ifr);
-               
-               set_fs(KERNEL_DS);
-               rc = sock->ops->ioctl(sock, SIOCGIFCONF, (unsigned long)&ifc);
-               set_fs(oldmm);
-
-               if (rc < 0) {
-                       CERROR ("Error %d enumerating interfaces\n", rc);
-                       goto out1;
-               }
-               
-               LASSERT (rc == 0);
-
-               nfound = ifc.ifc_len/sizeof(*ifr);
-               LASSERT (nfound <= nalloc);
-               
-               if (nfound <= nalloc)
-                       break;
-               
-               /* Assume there are more interfaces */
-               if (nalloc >= 16 * PTL_MAX_INTERFACES) {
-                       CWARN("Too many interfaces: "
-                             "only trying the first %d\n", nfound);
-                       break;
-               }
-
-               nalloc *= 2;
-       }
-
-       for (i = nused = 0; i < nfound; i++) {
-               strncpy(name, ifr[i].ifr_name, IFNAMSIZ); /* ensure terminated name */
-               name[IFNAMSIZ-1] = 0;
-               
-               if (!strncmp(name, "lo", 2)) {
-                       CDEBUG(D_WARNING, "ignoring %s\n", name);
-                       continue;
-               }
-               
-               strcpy(ifr[i].ifr_name, name);
-               set_fs(KERNEL_DS);
-               rc = sock->ops->ioctl(sock, SIOCGIFFLAGS, 
-                                     (unsigned long)&ifr[i]);
-               set_fs(oldmm);
-
-               if (rc != 0) {
-                       CDEBUG(D_WARNING, "Can't get flags for %s\n", name);
-                       continue;
-               }
-
-               if ((ifr[i].ifr_flags & IFF_UP) == 0) {
-                       CDEBUG(D_WARNING, "Interface %s down\n", name);
-                       continue;
-               }
-
-               rc = ksocknal_lib_getifaddr(sock, name, &ipaddr, &netmask);
-               if (rc != 0) {
-                       CDEBUG(D_WARNING, 
-                              "Can't get IP address or netmask for %s\n",
-                              name);
-                       continue;
-               }
-
-               if (nused >= nifs) {
-                       CWARN("Too many available interfaces: "
-                             "only using the first %d\n", nused);
-                       break;
-               }
-               
-               memset(&ifs[nused], 0, sizeof(ifs[nused]));
-               
-               CLASSERT(sizeof(ifs[nused].ksni_name) >= IFNAMSIZ);
-               strcpy(ifs[nused].ksni_name, name);
-               ifs[nused].ksni_ipaddr = ipaddr;
-               ifs[nused].ksni_netmask = netmask;
-               nused++;
-
-               CDEBUG(D_WARNING, "Added interface %s: %u.%u.%u.%u\n", 
-                      name, HIPQUAD(ipaddr));
-       }
-
-       rc = nused;
- out1:
-       PORTAL_FREE(ifr, nalloc * sizeof(*ifr));
- out0:
-       sock_release(sock);
-       return rc;
-}
-
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 struct tcp_opt *sock2tcp_opt(struct sock *sk)
 {
index 982bcdb..827e625 100644 (file)
 
 #define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10)
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72))
-# define sk_allocation  allocation
-# define sk_data_ready data_ready
-# define sk_write_space write_space
-# define sk_user_data   user_data
-# define sk_prot        prot
-# define sk_sndbuf      sndbuf
-# define sk_socket      socket
-# define sk_sleep       sleep
-#endif
-
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
-# define sk_wmem_queued wmem_queued
-# define sk_err         err
-#endif
-
 #define SOCKNAL_ARCH_EAGER_ACK 0
-#define SOCK_WMEM_QUEUED(so)    ((so)->sk->sk_wmem_queued)
-#define SOCK_ERROR(so)          ((so)->sk->sk_err)
-#define SOCK_TEST_NOSPACE(so)  test_bit(SOCK_NOSPACE, &(so)->flags)
 
 #ifndef CONFIG_SMP
 static inline
index db09995..88f523d 100644 (file)
@@ -2,7 +2,7 @@ MODULES = libcfs
 
 libcfs-linux-objs := linux-tracefile.o linux-debug.o
 libcfs-linux-objs += linux-prim.o linux-mem.o
-libcfs-linux-objs += linux-fs.o linux-sync.o
+libcfs-linux-objs += linux-fs.o linux-sync.o linux-tcpip.o
 libcfs-linux-objs += linux-lwt.o linux-proc.o linux-curproc.o
 libcfs-linux-objs += linux-utils.o linux-module.o
 
index 49f8e87..8bf35cc 100644 (file)
@@ -1,4 +1,4 @@
 EXTRA_DIST := linux-debug.c linux-lwt.c linux-prim.c linux-tracefile.c \
        linux-fs.c linux-mem.c linux-proc.c linux-utils.c linux-lock.c  \
-       linux-module.c linux-sync.c linux-curproc.c
+       linux-module.c linux-sync.c linux-curproc.c linux-tcpip.c
 
diff --git a/lnet/libcfs/linux/linux-tcpip.c b/lnet/libcfs/linux/linux-tcpip.c
new file mode 100644 (file)
index 0000000..1c996b2
--- /dev/null
@@ -0,0 +1,656 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <libcfs/kp30.h>
+#include <libcfs/libcfs.h>
+
+#include <linux/if.h>
+#include <linux/in.h>
+
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+       mm_segment_t   oldmm = get_fs();
+       struct ifreq   ifr;
+        struct socket *sock;
+        int            nob;
+       int            rc;
+       __u32          val;
+
+       rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock);
+       if (rc != 0) {
+               CERROR ("Can't create socket: %d\n", rc);
+               return rc;
+       }
+       
+       nob = strnlen(name, IFNAMSIZ);
+       if (nob == IFNAMSIZ) {
+               CERROR("Interface name %s too long\n", name);
+               rc -EINVAL;
+               goto out;
+       }
+       
+       CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+
+       strcpy(ifr.ifr_name, name);
+       set_fs(KERNEL_DS);
+       rc = sock->ops->ioctl(sock, SIOCGIFFLAGS, (unsigned long)&ifr);
+       set_fs(oldmm);
+
+       if (rc != 0) {
+               CERROR("Can't get flags for interface %s\n", name);
+               goto out;
+       }
+
+       if ((ifr.ifr_flags & IFF_UP) == 0) {
+               CDEBUG(D_NET, "Interface %s down\n", name);
+               *up = 0;
+               *ip = *mask = 0;
+               goto out;
+       }
+
+       strcpy(ifr.ifr_name, name);
+       ifr.ifr_addr.sa_family = AF_INET;
+       set_fs(KERNEL_DS);
+       rc = sock->ops->ioctl(sock, SIOCGIFADDR, (unsigned long)&ifr);
+       set_fs(oldmm);
+
+       if (rc != 0) {
+               CERROR("Can't get IP address for interface %s\n", name);
+               goto out;
+       }
+       
+       val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+       *ip = ntohl(val);
+
+       strcpy(ifr.ifr_name, name);
+       ifr.ifr_addr.sa_family = AF_INET;
+       set_fs(KERNEL_DS);
+       rc = sock->ops->ioctl(sock, SIOCGIFNETMASK, (unsigned long)&ifr);
+       set_fs(oldmm);
+       
+       if (rc != 0) {
+               CERROR("Can't get netmask for interface %s\n", name);
+               goto out;
+       }
+       
+       val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr;
+       *mask = ntohl(val);
+
+ out:
+       sock_release(sock);
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_query);
+
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+       /* Allocate and fill in 'names', returning # interfaces/error */
+        char           **names;
+       int             toobig;
+       int             nalloc;
+       int             nfound;
+       struct socket  *sock;
+       struct ifreq   *ifr;
+       struct ifconf   ifc;
+       mm_segment_t    oldmm = get_fs();
+       int             rc;
+       int             nob;
+       int             i;
+
+       rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+       if (rc != 0) {
+               CERROR ("Can't create socket: %d\n", rc);
+               return rc;
+       }
+
+       nalloc = 16;    /* first guess at max interfaces */
+       toobig = 0;
+       for (;;) {
+               if (nalloc * sizeof(*ifr) > PAGE_SIZE) {
+                       toobig = 1;
+                       nalloc = PAGE_SIZE/sizeof(*ifr);
+                       CWARN("Too many interfaces: only enumerating first %d\n",
+                             nalloc);
+               }
+
+               PORTAL_ALLOC(ifr, nalloc * sizeof(*ifr));
+               if (ifr == NULL) {
+                       CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+                       rc = -ENOMEM;
+                       goto out0;
+               }
+               
+               ifc.ifc_buf = (char *)ifr;
+               ifc.ifc_len = nalloc * sizeof(*ifr);
+               
+               set_fs(KERNEL_DS);
+               rc = sock->ops->ioctl(sock, SIOCGIFCONF, (unsigned long)&ifc);
+               set_fs(oldmm);
+
+               if (rc < 0) {
+                       CERROR ("Error %d enumerating interfaces\n", rc);
+                       goto out1;
+               }
+               
+               LASSERT (rc == 0);
+
+               nfound = ifc.ifc_len/sizeof(*ifr);
+               LASSERT (nfound <= nalloc);
+
+               if (nfound < nalloc || toobig)
+                       break;
+
+                PORTAL_FREE(ifr, nalloc * sizeof(*ifr));
+               nalloc *= 2;
+       }
+
+        if (nfound == 0)
+                goto out1;
+
+        PORTAL_ALLOC(names, nfound * sizeof(*names));
+        if (names == NULL) {
+                rc = -ENOMEM;
+                goto out1;
+        }
+        /* NULL out all names[i] */
+        memset (names, 0, nfound * sizeof(*names));
+                
+       for (i = 0; i < nfound; i++) {
+
+               nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+               if (nob == IFNAMSIZ) {
+                       /* no space for terminating NULL */
+                       CERROR("interface name %.*s too long (%d max)\n", 
+                              nob, ifr[i].ifr_name, IFNAMSIZ);
+                       rc = -ENAMETOOLONG;
+                        goto out2;
+               }
+
+                PORTAL_ALLOC(names[i], IFNAMSIZ);
+                if (names[i] == NULL) {
+                        rc = -ENOMEM;
+                        goto out2;
+                }
+
+               memcpy(names[i], ifr[i].ifr_name, nob);
+               names[i][nob] = 0;
+       }
+
+        *namesp = names;
+       rc = nfound;
+        
+ out2:
+        if (rc < 0)
+                libcfs_ipif_free_enumeration(names, nfound);
+ out1:
+       PORTAL_FREE(ifr, nalloc * sizeof(*ifr));
+ out0:
+       sock_release(sock);
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_enumerate);
+
+void
+libcfs_ipif_free_enumeration (char **names, int n)
+{
+        int      i;
+
+        LASSERT (n > 0);
+
+        for (i = 0; i < n && names[i] != NULL; i++)
+                PORTAL_FREE(names[i], IFNAMSIZ);
+
+        PORTAL_FREE(names, n * sizeof(*names));
+}
+
+EXPORT_SYMBOL(libcfs_ipif_free_enumeration);
+
+int
+libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
+{
+        int            rc;
+        mm_segment_t   oldmm = get_fs();
+       long           ticks = timeout * HZ;
+       unsigned long  then;
+       struct timeval tv;
+       
+       LASSERT (nob > 0);
+       /* Caller may pass a zero timeout if she thinks the socket buffer is
+        * empty enough to take the whole message immediately */
+
+        for (;;) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &iov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
+                };
+
+               if (timeout != 0) {
+                       /* Set send timeout to remaining time */
+                       tv = (struct timeval) {
+                               .tv_sec = ticks / HZ,
+                               .tv_usec = ((ticks % HZ) * 1000000) / HZ
+                       };
+                       set_fs(KERNEL_DS);
+                       rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+                                            (char *)&tv, sizeof(tv));
+                       set_fs(oldmm);
+                       if (rc != 0) {
+                               CERROR("Can't set socket send timeout "
+                                      "%ld.%06d: %d\n", 
+                                       (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                               return rc;
+                       }
+               }
+
+                set_fs (KERNEL_DS);
+                rc = sock_sendmsg (sock, &msg, iov.iov_len);
+                set_fs (oldmm);
+
+               if (rc == nob)
+                       return 0;
+
+                if (rc < 0)
+                        return rc;
+
+                if (rc == 0) {
+                        CERROR ("Unexpected zero rc\n");
+                        return (-ECONNABORTED);
+                }
+
+               if (timeout == 0)
+                       return -EAGAIN;
+               
+                buffer = ((char *)buffer) + rc;
+                nob -= rc;
+        }
+
+        return (0);
+}
+
+EXPORT_SYMBOL(libcfs_sock_write);
+
+int
+libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
+{
+        int            rc;
+        mm_segment_t   oldmm = get_fs();
+        long           ticks = timeout * HZ;
+        unsigned long  then;
+        struct timeval tv;
+
+        LASSERT (nob > 0);
+        LASSERT (ticks > 0);
+
+        for (;;) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &iov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = 0
+                };
+
+                /* Set receive timeout to remaining time */
+                tv = (struct timeval) {
+                        .tv_sec = ticks / HZ,
+                        .tv_usec = ((ticks % HZ) * 1000000) / HZ
+                };
+                set_fs(KERNEL_DS);
+                rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+                                     (char *)&tv, sizeof(tv));
+                set_fs(oldmm);
+                if (rc != 0) {
+                        CERROR("Can't set socket recv timeout %ld.%06d: %d\n",
+                               (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                        return rc;
+                }
+
+                set_fs(KERNEL_DS);
+                then = jiffies;
+                rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
+                ticks -= jiffies - then;
+                set_fs(oldmm);
+
+                if (rc < 0)
+                        return rc;
+
+                if (rc == 0)
+                        return -ECONNABORTED;
+
+                buffer = ((char *)buffer) + rc;
+                nob -= rc;
+
+                if (nob == 0)
+                        return 0;
+
+                if (ticks <= 0)
+                        return -ETIMEDOUT;
+        }
+}
+
+EXPORT_SYMBOL(libcfs_sock_read);
+
+static int
+libcfs_sock_create (struct socket **sockp, int *fatal, 
+                    __u32 local_ip, int local_port)
+{
+        struct sockaddr_in  locaddr;
+        struct socket      *sock;
+        int                 rc;
+        int                 option;
+        mm_segment_t        oldmm = get_fs();
+
+        /* All errors are fatal except bind failure if the port is in use */
+       *fatal = 1;
+
+        rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+        *sockp = sock;
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return (rc);
+        }
+
+        set_fs (KERNEL_DS);
+        option = 1;
+        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+                             (char *)&option, sizeof (option));
+        set_fs (oldmm);
+        if (rc != 0) {
+                CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+                goto failed;
+        }
+
+        /* can't specify a local port without a local IP */
+        LASSERT (local_ip == 0 || local_port != 0);
+
+        if (local_ip != 0 || local_port != 0) {
+                memset(&locaddr, 0, sizeof(locaddr));
+                locaddr.sin_family = AF_INET;
+                locaddr.sin_port = htons(local_port);
+                locaddr.sin_addr.s_addr = (local_ip == 0) ? 
+                                          INADDR_ANY : htonl(local_ip);
+                
+                rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr, 
+                                     sizeof(locaddr));
+                if (rc == -EADDRINUSE) {
+                        CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                        *fatal = 0;
+                        goto failed;
+                }
+                if (rc != 0) {
+                        CERROR("Error trying to bind to port %d: %d\n",
+                               local_port, rc);
+                        goto failed;
+                }
+        }
+        
+       return 0;
+
+ failed:
+        sock_release(sock);
+        return rc;
+}
+
+int
+libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
+{
+        mm_segment_t        oldmm = get_fs();
+        int                 option;
+        int                 rc;
+
+        if (txbufsize != 0) {
+                option = txbufsize;
+                set_fs (KERNEL_DS);
+                rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+                                     (char *)&option, sizeof (option));
+                set_fs (oldmm);
+                if (rc != 0) {
+                        CERROR ("Can't set send buffer %d: %d\n", 
+                                option, rc);
+                        return (rc);
+                }
+        }
+        
+        if (rxbufsize != 0) {
+                option = rxbufsize;
+                set_fs (KERNEL_DS);
+                rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF,
+                                      (char *)&option, sizeof (option));
+                set_fs (oldmm);
+                if (rc != 0) {
+                        CERROR ("Can't set receive buffer %d: %d\n", 
+                                option, rc);
+                        return (rc);
+                }
+        }
+        
+        return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_setbuf);
+
+int
+libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
+{
+        mm_segment_t        oldmm = get_fs();
+        int                 option;
+        int                 optlen;
+        int                 rc;
+
+        if (txbufsize != NULL) {
+                optlen = sizeof(option);
+                set_fs (KERNEL_DS);
+                rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+                                     (char *)&option, &optlen);
+                set_fs (oldmm);
+                if (rc != 0) {
+                        CERROR ("Can't get send buffer size: %d\n", rc);
+                        return (rc);
+                }
+                *txbufsize = option;
+        }
+        
+        if (rxbufsize != NULL) {
+                optlen = sizeof(option);
+                set_fs (KERNEL_DS);
+                rc = sock_getsockopt (sock, SOL_SOCKET, SO_RCVBUF,
+                                      (char *)&option, &optlen);
+                set_fs (oldmm);
+                if (rc != 0) {
+                        CERROR ("Can't get receive buffer size: %d\n", rc);
+                        return (rc);
+                }
+                *rxbufsize = option;
+        }
+        
+        return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getbuf);
+
+int
+libcfs_sock_listen (struct socket **sockp, 
+                    __u32 local_ip, int local_port, int backlog)
+{
+        int      fatal;
+       int      rc;
+
+        rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port);
+        if (rc != 0) {
+                if (!fatal)
+                        CERROR("Can't create socket: port %d already in use\n",
+                               local_port);
+                return rc;
+        }
+        
+       rc = (*sockp)->ops->listen(*sockp, backlog);
+       if (rc == 0)
+               return 0;
+       
+       CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
+       sock_release(*sockp);
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_listen);
+
+int
+libcfs_sock_accept (struct socket **newsockp, struct socket *sock, int bufsize)
+{
+       wait_queue_t   wait;
+       struct socket *newsock;
+       int            rc;
+
+       init_waitqueue_entry(&wait, current);
+
+       newsock = sock_alloc();
+       if (newsock == NULL) {
+               CERROR("Can't allocate socket\n");
+               return -ENOMEM;
+       }
+
+       /* XXX this should add a ref to sock->ops->owner, if
+        * TCP could be a module */
+       newsock->type = sock->type;
+       newsock->ops = sock->ops;
+
+       set_current_state(TASK_INTERRUPTIBLE);
+       add_wait_queue(sock->sk->sk_sleep, &wait);
+       
+       rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+       if (rc == -EAGAIN) {
+               /* Nothing ready, so wait for activity */
+               schedule();
+               rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+       }
+       
+       remove_wait_queue(sock->sk->sk_sleep, &wait);
+       set_current_state(TASK_RUNNING);
+
+       if (rc != 0)
+               goto failed;
+
+        if (bufsize != 0) {
+                rc = libcfs_sock_setbuf(newsock, bufsize, bufsize);
+                if (rc != 0)
+                        goto failed;
+        }
+
+       *newsockp = newsock;
+       return 0;
+
+ failed:
+       sock_release(newsock);
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_accept);
+
+void
+libcfs_sock_abort_accept (struct socket *sock)
+{
+       wake_up_all(sock->sk->sk_sleep);
+}
+
+EXPORT_SYMBOL(libcfs_sock_abort_accept);
+
+int
+libcfs_sock_connect (struct socket **sockp, int *fatal, int bufsize,
+                     __u32 local_ip, int local_port,
+                     __u32 peer_ip, int peer_port)
+{
+        struct sockaddr_in  srvaddr;
+        int                 rc;
+
+       rc = libcfs_sock_create(sockp, fatal, local_ip, local_port);
+       if (rc != 0)
+               return rc;
+
+        if (bufsize != 0) {
+                rc = libcfs_sock_setbuf(*sockp, bufsize, bufsize);
+                if (rc != 0)
+                        goto failed;
+        }
+
+        memset (&srvaddr, 0, sizeof (srvaddr));
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons(peer_port);
+        srvaddr.sin_addr.s_addr = htonl(peer_ip);
+
+        rc = (*sockp)->ops->connect(*sockp,
+                                   (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+                                   0);
+        if (rc == 0)
+                return 0;
+
+        /* EADDRNOTAVAIL probably means we're already connected to the same
+         * peer/port on the same local port on a differently typed
+         * connection.  Let our caller retry with a different local
+         * port... */
+        *fatal = !(rc == -EADDRNOTAVAIL);
+
+        CDEBUG(*fatal ? D_ERROR : D_NET,
+               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+               HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+
+ failed:
+       sock_release(*sockp);
+        return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_connect);
+
+void
+libcfs_sock_release (struct socket *sock)
+{
+       sock_release(sock);
+}
+
+EXPORT_SYMBOL(libcfs_sock_release);
+
+void
+libcfs_pause (cfs_duration_t ticks)
+{
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        schedule_timeout(ticks);
+}
+
+EXPORT_SYMBOL(libcfs_pause);
index 2156f87..9b84b66 100644 (file)
@@ -310,7 +310,7 @@ libcfs_net2str(__u32 net)
        char           *str = libcfs_next_nidstring();
 
         if (nf == NULL) 
-                snprintf(str, PTL_NALFMT_SIZE, "t<%u>%u", nal, num);
+                snprintf(str, PTL_NALFMT_SIZE, "<%u:%u>", nal, num);
         else if (num == 0)
                 snprintf(str, PTL_NALFMT_SIZE, "%s", nf->nf_name);
         else
@@ -337,7 +337,7 @@ libcfs_nid2str(ptl_nid_t nid)
        str = libcfs_next_nidstring();
 
         if (nf == NULL)
-                snprintf(str, PTL_NALFMT_SIZE, "%x@t<%u>%u", addr, nal, nnum);
+                snprintf(str, PTL_NALFMT_SIZE, "%x@<%u:%u>", addr, nal, nnum);
         else {
                 nf->nf_addr2str(addr, str);
                 nob = strlen(str);
@@ -366,10 +366,9 @@ libcfs_str2net_internal(char *str, __u32 *net)
                 if (!strncmp(str, nf->nf_name, strlen(nf->nf_name)))
                         break;
         }
-        if (i == libcfs_nnalstrfns) {
-                CWARN("No base: %s\n", str);
+
+        if (i == libcfs_nnalstrfns)
                 return NULL;
-        }
         
         nob = strlen(nf->nf_name);
 
index d59e2fd..631a7c6 100644 (file)
@@ -726,3 +726,77 @@ ptl_parse_routes (char *routes)
        LASSERT (ptl_tbnob == 0);
        return rc;
 }
+
+#ifdef __KERNEL__
+ptl_err_t
+ptl_set_ip_niaddr (ptl_ni_t *ni) 
+{
+        __u32  net = PTL_NIDNET(ni->ni_nid);
+        char **names;
+        int    n;
+        __u32  ip;
+        __u32  netmask;
+        int    up;
+        int    i;
+        int    rc;
+
+        /* Convenience for NALs that use the IP address of a local interface as
+         * the local address part of their NID */
+
+        if (ni->ni_interfaces[0] != NULL) {
+
+                CLASSERT (PTL_MAX_INTERFACES > 1);
+
+                if (ni->ni_interfaces[1] != NULL) {
+                        CERROR("Net %s doesn't support multiple interfaces\n",
+                               libcfs_net2str(net));
+                        return PTL_FAIL;
+                }
+                
+                rc = libcfs_ipif_query(ni->ni_interfaces[0],
+                                       &up, &ip, &netmask);
+                if (rc != 0) {
+                        CERROR("Net %s can't qeury interface %s: %d\n",
+                               libcfs_net2str(net), ni->ni_interfaces[0], rc);
+                        return PTL_FAIL;
+                }
+                
+                ni->ni_nid = PTL_MKNID(net, ip);
+                return PTL_OK;
+        }
+
+        n = libcfs_ipif_enumerate(&names);
+        if (n <= 0) {
+                CERROR("Net %s can't enumerate interfaces: %d\n", 
+                       libcfs_net2str(net), n);
+                return 0;
+        }
+
+        for (i = 0; i < n; i++) {
+                rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
+                
+                if (rc != 0) {
+                        CWARN("Net %s can't query interface %s: %d\n",
+                              libcfs_net2str(net), names[i], rc);
+                        continue;
+                }
+                        
+                if (!up) {
+                        CWARN("Net %s ignoring interface %s (down)\n",
+                              libcfs_net2str(net), names[i]);
+                        continue;
+                }
+
+                libcfs_ipif_free_enumeration(names, n);
+                ni->ni_nid = PTL_MKNID(net, ip);
+                return PTL_OK;
+        }
+
+        CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
+        libcfs_ipif_free_enumeration(names, n);
+        return PTL_FAIL;
+}
+
+EXPORT_SYMBOL(ptl_set_ip_niaddr);
+
+#endif
index 6c60eab..4543da9 100644 (file)
@@ -349,6 +349,7 @@ int jt_ptl_network(int argc, char **argv)
                         
                         if (net == PTL_NIDNET(data.ioc_nid)) {
                                 g_net_set = 1;
+                                g_net = net;
                                 return 0;
                         }
                         continue;
@@ -373,8 +374,8 @@ int jt_ptl_network(int argc, char **argv)
                 return -1;
         }
         
-        fprintf(stderr,"%s not a local network (%s on its own to list)\n",
-                argv[1]);
+        fprintf(stderr,"%s not a local network (%s on its own to list them all)\n",
+                argv[1], argv[0]);
         return -1;
 #endif
 }
@@ -527,21 +528,22 @@ jt_ptl_print_peers (int argc, char **argv)
                         break;
 
                 if (g_net_is_compatible(NULL, SOCKNAL, 0))
-                        printf (LPX64"[%d]%s@%s:%d #%d\n",
-                                data.ioc_nid
+                        printf ("%-20s [%d]%s->%s:%d #%d\n",
+                                libcfs_nid2str(data.ioc_nid)
                                 data.ioc_count, /* persistence */
                                 ptl_ipaddr_2_str (data.ioc_u32[2], buffer[0], 1), /* my ip */
                                 ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* peer ip */
                                 data.ioc_u32[1], /* peer port */
                                 data.ioc_u32[3]); /* conn_count */
                 else if (g_net_is_compatible(NULL, RANAL, OPENIBNAL, VIBNAL, 0))
-                        printf (LPX64"[%d]@%s:%d\n",
-                                data.ioc_nid, data.ioc_count,
+                        printf ("%-20s [%d]@%s:%d\n",
+                                libcfs_nid2str(data.ioc_nid), 
+                                data.ioc_count,
                                 ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* peer ip */
                                 data.ioc_u32[1]); /* peer port */
                 else
-                        printf (LPX64"[%d]\n",
-                                data.ioc_nid, data.ioc_count);
+                        printf ("%-20s [%d]\n",
+                                libcfs_nid2str(data.ioc_nid), data.ioc_count);
         }
 
         if (index == 0) {
@@ -689,32 +691,31 @@ jt_ptl_print_connections (int argc, char **argv)
                 PORTAL_IOC_INIT(data);
                 data.ioc_net     = g_net;
                 data.ioc_count   = index;
-                
+
                 rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_CONN, &data);
                 if (rc != 0)
                         break;
 
                 if (g_net_is_compatible (NULL, SOCKNAL, 0))
-                        printf ("[%d]%s:"LPX64"@%s:%d:%s %d/%d %s\n",
-                                data.ioc_u32[4], /* scheduler */
-                                ptl_ipaddr_2_str (data.ioc_u32[2], buffer[0], 1), /* local IP addr */
-                                data.ioc_nid, 
-                                ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* remote IP addr */
-                                data.ioc_u32[1],         /* remote port */
+                        printf ("%-20s %s[%d]%s->%s:%d %d/%d %s\n",
+                                libcfs_nid2str(data.ioc_nid),
                                 (data.ioc_u32[3] == SOCKNAL_CONN_ANY) ? "A" :
                                 (data.ioc_u32[3] == SOCKNAL_CONN_CONTROL) ? "C" :
                                 (data.ioc_u32[3] == SOCKNAL_CONN_BULK_IN) ? "I" :
                                 (data.ioc_u32[3] == SOCKNAL_CONN_BULK_OUT) ? "O" : "?",
+                                data.ioc_u32[4], /* scheduler */
+                                ptl_ipaddr_2_str (data.ioc_u32[2], buffer[0], 1), /* local IP addr */
+                                ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* remote IP addr */
+                                data.ioc_u32[1],         /* remote port */
                                 data.ioc_count, /* tx buffer size */
                                 data.ioc_u32[5], /* rx buffer size */
                                 data.ioc_flags ? "nagle" : "nonagle");
                 else if (g_net_is_compatible (NULL, RANAL, 0))
-                        printf ("[%d]"LPX64"@%s:%d\n",
-                                data.ioc_u32[0], /* device id */
-                                data.ioc_nid);
+                        printf ("%-20s [%d]\n",
+                                libcfs_nid2str(data.ioc_nid),
+                                data.ioc_u32[0] /* device id */);
                 else
-                        printf (LPX64"\n",
-                                data.ioc_nid);
+                        printf ("%s\n", libcfs_nid2str(data.ioc_nid));
         }
 
         if (index == 0) {