Whamcloud - gitweb
* brought openibnal into the newconfig fold
author eeb <eeb>
Mon, 6 Jun 2005 21:22:25 +0000 (21:22 +0000)
committer eeb <eeb>
Mon, 6 Jun 2005 21:22:25 +0000 (21:22 +0000)
*  moved socket address query function out of NALs and into libcfs

lnet/klnds/openiblnd/Makefile.in
lnet/klnds/openiblnd/openiblnd.c
lnet/klnds/openiblnd/openiblnd.h
lnet/klnds/openiblnd/openiblnd_cb.c
lnet/klnds/openiblnd/openiblnd_modparams.c [new file with mode: 0644]

index 9b8ed5d..a6eb048 100644 (file)
@@ -1,5 +1,5 @@
 MODULES := kopenibnal
-kopenibnal-objs := openibnal.o openibnal_cb.o
+kopenibnal-objs := openibnal.o openibnal_cb.o openibnal_modparams.o
 
 EXTRA_POST_CFLAGS := @OPENIBCPPFLAGS@
 
index 432f937..f618339 100644 (file)
@@ -35,37 +35,6 @@ ptl_nal_t               kibnal_nal = {
 };
 
 kib_data_t              kibnal_data;
-kib_tunables_t          kibnal_tunables;
-
-#define IBNAL_SYSCTL             202
-
-enum {
-        IBNAL_SYSCTL_TIMEOUT=1,
-        IBNAL_SYSCTL_LISTENER_TIMEOUT,
-        IBNAL_SYSCTL_BACKLOG,
-        IBNAL_SYSCTL_PORT
-};
-
-static ctl_table kibnal_ctl_table[] = {
-        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
-         &kibnal_tunables.kib_io_timeout, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        {IBNAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout", 
-         &kibnal_tunables.kib_listener_timeout, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        {IBNAL_SYSCTL_BACKLOG, "backlog",
-         &kibnal_tunables.kib_backlog, sizeof(int),
-         0644, NULL, kibnal_listener_procint},
-        {IBNAL_SYSCTL_PORT, "port",
-         &kibnal_tunables.kib_port, sizeof(int),
-         0644, NULL, kibnal_listener_procint},
-        { 0 }
-};
-
-static ctl_table kibnal_top_ctl_table[] = {
-        {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
-        { 0 }
-};
 
 __u32 
 kibnal_cksum (void *ptr, int nob)
@@ -102,10 +71,11 @@ kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, __u64 dststamp)
         msg->ibm_srcstamp = kibnal_data.kib_incarnation;
         msg->ibm_dstnid   = dstnid;
         msg->ibm_dststamp = dststamp;
-#if IBNAL_CKSUM
-        /* NB ibm_cksum zero while computing cksum */
-        msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
-#endif
+
+        if (*kibnal_tunables.kib_cksum) {
+                /* NB ibm_cksum zero while computing cksum */
+                msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
+        }
 }
 
 int
@@ -244,216 +214,34 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
 }
 
 int
-kibnal_sock_write (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
-        struct iovec  iov = {
-                .iov_base = buffer,
-                .iov_len  = nob
-        };
-        struct msghdr msg = {
-                .msg_name       = NULL,
-                .msg_namelen    = 0,
-                .msg_iov        = &iov,
-                .msg_iovlen     = 1,
-                .msg_control    = NULL,
-                .msg_controllen = 0,
-                .msg_flags      = MSG_DONTWAIT
-        };
-
-        /* We've set up the socket's send buffer to be large enough for
-         * everything we send, so a single non-blocking send should
-         * complete without error. */
-
-        set_fs(KERNEL_DS);
-        rc = sock_sendmsg(sock, &msg, iov.iov_len);
-        set_fs(oldmm);
-
-        if (rc == nob)
-                return 0;
-
-        if (rc >= 0)
-                return -EAGAIN;
-
-        return rc;
-}
-
-int
-kibnal_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
-{
-        int            rc;
-        mm_segment_t   oldmm = get_fs();
-        long           ticks = timeout * HZ;
-        unsigned long  then;
-        struct timeval tv;
-
-        LASSERT (nob > 0);
-        LASSERT (ticks > 0);
-
-        for (;;) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
-
-                /* Set receive timeout to remaining time */
-                tv = (struct timeval) {
-                        .tv_sec = ticks / HZ,
-                        .tv_usec = ((ticks % HZ) * 1000000) / HZ
-                };
-                set_fs(KERNEL_DS);
-                rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
-                                     (char *)&tv, sizeof(tv));
-                set_fs(oldmm);
-                if (rc != 0) {
-                        CERROR("Can't set socket recv timeout %d: %d\n",
-                               timeout, rc);
-                        return rc;
-                }
-
-                set_fs(KERNEL_DS);
-                then = jiffies;
-                rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
-                ticks -= jiffies - then;
-                set_fs(oldmm);
-
-                if (rc < 0)
-                        return rc;
-
-                if (rc == 0)
-                        return -ECONNABORTED;
-
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
-
-                if (nob == 0)
-                        return 0;
-
-                if (ticks <= 0)
-                        return -ETIMEDOUT;
-        }
-}
-
-int
-kibnal_create_sock(struct socket **sockp)
-{
-        struct socket       *sock;
-        int                  rc;
-        int                  option;
-        mm_segment_t         oldmm = get_fs();
-
-        rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock);
-        if (rc != 0) {
-                CERROR("Can't create socket: %d\n", rc);
-                return rc;
-        }
-
-        /* Ensure sends will not block */
-        option = 2 * sizeof(kib_msg_t);
-        set_fs(KERNEL_DS);
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-                             (char *)&option, sizeof(option));
-        set_fs(oldmm);
-        if (rc != 0) {
-                CERROR("Can't set send buffer %d: %d\n", option, rc);
-                goto failed;
-        }
-
-        option = 1;
-        set_fs(KERNEL_DS);
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-                             (char *)&option, sizeof(option));
-        set_fs(oldmm);
-        if (rc != 0) {
-                CERROR("Can't set SO_REUSEADDR: %d\n", rc);
-                goto failed;
-        }
-
-        *sockp = sock;
-        return 0;
-
- failed:
-        sock_release(sock);
-        return rc;
-}
-
-void
-kibnal_pause(int ticks)
-{
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        schedule_timeout(ticks);
-}
-
-int
 kibnal_connect_sock(kib_peer_t *peer, struct socket **sockp)
 {
-        struct sockaddr_in  locaddr;
-        struct sockaddr_in  srvaddr;
-        struct socket      *sock;
         unsigned int        port;
         int                 rc;
+        int                 fatal;
 
         for (port = 1023; port >= 512; port--) {
 
-                memset(&locaddr, 0, sizeof(locaddr)); 
-                locaddr.sin_family      = AF_INET; 
-                locaddr.sin_port        = htons(port);
-                locaddr.sin_addr.s_addr = htonl(INADDR_ANY);
-
-                memset (&srvaddr, 0, sizeof (srvaddr));
-                srvaddr.sin_family      = AF_INET;
-                srvaddr.sin_port        = htons (peer->ibp_port);
-                srvaddr.sin_addr.s_addr = htonl (peer->ibp_ip);
-
-                rc = kibnal_create_sock(&sock);
-                if (rc != 0)
-                        return rc;
-
-                rc = sock->ops->bind(sock,
-                                     (struct sockaddr *)&locaddr, sizeof(locaddr));
-                if (rc != 0) {
-                        sock_release(sock);
-                        
-                        if (rc == -EADDRINUSE) {
-                                CDEBUG(D_NET, "Port %d already in use\n", port);
-                                continue;
-                        }
-
-                        CERROR("Can't bind to reserved port %d: %d\n", port, rc);
-                        return rc;
-                }
-
-                rc = sock->ops->connect(sock,
-                                        (struct sockaddr *)&srvaddr, sizeof(srvaddr),
-                                        0);
-                if (rc == 0) {
-                        *sockp = sock;
+                rc = libcfs_sock_connect(sockp, &fatal,
+                                         2 * sizeof(kib_msg_t),
+                                         0, port,
+                                         peer->ibp_ip, peer->ibp_port);
+                if (rc == 0)
                         return 0;
-                }
                 
-                sock_release(sock);
-
-                if (rc != -EADDRNOTAVAIL) {
-                        CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n",
-                               port, HIPQUAD(peer->ibp_ip), peer->ibp_port, rc);
-                        return rc;
+                if (!fatal) {
+                        CDEBUG(D_NET, "Port %d already in use\n", port);
+                        continue;
                 }
                 
-                CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n", 
-                       port, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n",
+                       port, HIPQUAD(peer->ibp_ip), peer->ibp_port, rc);
+                return rc;
         }
 
         /* all ports busy */
+        CERROR("Can't connect to %u.%u.%u.%u/%d: all ports busy\n",
+               HIPQUAD(peer->ibp_ip), peer->ibp_port);
         return -EHOSTUNREACH;
 }
 
@@ -476,7 +264,7 @@ kibnal_make_svcqry (kib_conn_t *conn)
         if (rc != 0)
                 return rc;
         
-        rc = kibnal_sock_write(sock, msg, msg->ibm_nob);
+        rc = libcfs_sock_write(sock, msg, msg->ibm_nob, 0);
         if (rc != 0) {
                 CERROR("Error %d sending svcqry to "
                        LPX64"@%u.%u.%u.%u/%d\n", rc, 
@@ -485,7 +273,7 @@ kibnal_make_svcqry (kib_conn_t *conn)
         }
 
         nob = offsetof(kib_msg_t, ibm_u) + sizeof(msg->ibm_u.svcrsp);
-        rc = kibnal_sock_read(sock, msg, nob, kibnal_tunables.kib_io_timeout);
+        rc = libcfs_sock_read(sock, msg, nob, *kibnal_tunables.kib_timeout);
         if (rc != 0) {
                 CERROR("Error %d receiving svcrsp from "
                        LPX64"@%u.%u.%u.%u/%d\n", rc, 
@@ -530,32 +318,26 @@ kibnal_make_svcqry (kib_conn_t *conn)
         conn->ibc_incarnation = msg->ibm_srcstamp;
         conn->ibc_connreq->cr_svcrsp = msg->ibm_u.svcrsp;
  out:
-        sock_release(sock);
+        libcfs_sock_release(sock);
         return rc;
 }
 
 void
 kibnal_handle_svcqry (struct socket *sock)
 {
-        struct sockaddr_in   addr;
         __u32                peer_ip;
         unsigned int         peer_port;
         kib_msg_t           *msg;
         __u64                srcnid;
         __u64                srcstamp;
-        int                  len;
         int                  rc;
 
-        len = sizeof(addr);
-        rc = sock->ops->getname(sock, (struct sockaddr *)&addr, &len, 2);
+        rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
         if (rc != 0) {
                 CERROR("Can't get peer's IP: %d\n", rc);
                 return;
         }
 
-        peer_ip = ntohl(addr.sin_addr.s_addr);
-        peer_port = ntohs(addr.sin_port);
-
         if (peer_port >= 1024) {
                 CERROR("Refusing unprivileged connection from %u.%u.%u.%u/%d\n",
                        HIPQUAD(peer_ip), peer_port);
@@ -569,8 +351,8 @@ kibnal_handle_svcqry (struct socket *sock)
                 goto out;
         }
         
-        rc = kibnal_sock_read(sock, msg, offsetof(kib_msg_t, ibm_u),
-                              kibnal_tunables.kib_listener_timeout);
+        rc = libcfs_sock_read(sock, msg, offsetof(kib_msg_t, ibm_u),
+                              *kibnal_tunables.kib_listener_timeout);
         if (rc != 0) {
                 CERROR("Error %d receiving svcqry from %u.%u.%u.%u/%d\n",
                        rc, HIPQUAD(peer_ip), peer_port);
@@ -610,7 +392,7 @@ kibnal_handle_svcqry (struct socket *sock)
 
         kibnal_pack_msg(msg, 0, srcnid, srcstamp);
         
-        rc = kibnal_sock_write (sock, msg, msg->ibm_nob);
+        rc = libcfs_sock_write (sock, msg, msg->ibm_nob, 0);
         if (rc != 0) {
                 CERROR("Error %d replying to svcqry from %u.%u.%u.%u/%d\n",
                        rc, HIPQUAD(peer_ip), peer_port);
@@ -624,15 +406,13 @@ kibnal_handle_svcqry (struct socket *sock)
 void
 kibnal_free_acceptsock (kib_acceptsock_t *as)
 {
-        sock_release(as->ibas_sock);
+        libcfs_sock_release(as->ibas_sock);
         PORTAL_FREE(as, sizeof(*as));
 }
 
 int
 kibnal_ip_listener(void *arg)
 {
-        struct sockaddr_in addr;
-        wait_queue_t       wait;
         struct socket     *sock;
         kib_acceptsock_t  *as;
         int                port;
@@ -640,37 +420,15 @@ kibnal_ip_listener(void *arg)
         int                rc;
         unsigned long      flags;
 
-        /* Parent thread holds kib_nid_mutex, and is, or is about to
-         * block on kib_listener_signal */
-
-        port = kibnal_tunables.kib_port;
+        port = *kibnal_tunables.kib_port;
         snprintf(name, sizeof(name), "kibnal_lstn%03d", port);
         kportal_daemonize(name);
         kportal_blockallsigs();
 
-        init_waitqueue_entry(&wait, current);
-
-        rc = kibnal_create_sock(&sock);
+        rc = libcfs_sock_listen(&sock, 0, port,
+                                *kibnal_tunables.kib_backlog);
         if (rc != 0)
-                goto out_0;
-
-        memset(&addr, 0, sizeof(addr));
-        addr.sin_family      = AF_INET;
-        addr.sin_port        = htons(port);
-        addr.sin_addr.s_addr = INADDR_ANY;
-
-        rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr));
-        if (rc != 0) {
-                CERROR("Can't bind to port %d\n", port);
-                goto out_1;
-        }
-
-        rc = sock->ops->listen(sock, kibnal_tunables.kib_backlog);
-        if (rc != 0) {
-                CERROR("Can't set listen backlog %d: %d\n", 
-                       kibnal_tunables.kib_backlog, rc);
-                goto out_1;
-        }
+                goto out;
 
         LASSERT (kibnal_data.kib_listener_sock == NULL);
         kibnal_data.kib_listener_sock = sock;
@@ -679,8 +437,6 @@ kibnal_ip_listener(void *arg)
         LASSERT (kibnal_data.kib_listener_shutdown == 0);
         up(&kibnal_data.kib_listener_signal);
 
-        /* Wake me any time something happens on my socket */
-        add_wait_queue(sock->sk->sk_sleep, &wait);
         as = NULL;
 
         while (kibnal_data.kib_listener_shutdown == 0) {
@@ -689,67 +445,39 @@ kibnal_ip_listener(void *arg)
                         PORTAL_ALLOC(as, sizeof(*as));
                         if (as == NULL) {
                                 CERROR("Out of Memory: pausing...\n");
-                                kibnal_pause(HZ);
+                                libcfs_pause(HZ);
                                 continue;
                         }
                         as->ibas_sock = NULL;
                 }
 
-                if (as->ibas_sock == NULL) {
-                        as->ibas_sock = sock_alloc();
-                        if (as->ibas_sock == NULL) {
-                                CERROR("Can't allocate socket: pausing...\n");
-                                kibnal_pause(HZ);
-                                continue;
-                        }
-                        /* XXX this should add a ref to sock->ops->owner, if
-                         * TCP could be a module */
-                        as->ibas_sock->type = sock->type;
-                        as->ibas_sock->ops = sock->ops;
+                rc = libcfs_sock_accept(&as->ibas_sock, sock,
+                                        2 * sizeof(kib_msg_t));
+                if (rc != 0) {
+                        if (rc != -EAGAIN) {
+                                CERROR("Accept failed: %d, pausing...\n", rc);
+                                libcfs_pause(cfs_time_seconds(1));
+                        }                                                        
+                        continue;
                 }
                 
-                set_current_state(TASK_INTERRUPTIBLE);
-
-                rc = sock->ops->accept(sock, as->ibas_sock, O_NONBLOCK);
-
-                /* Sleep for socket activity? */
-                if (rc == -EAGAIN &&
-                    kibnal_data.kib_listener_shutdown == 0)
-                        schedule();
-
-                set_current_state(TASK_RUNNING);
-
-                if (rc == 0) {
-                        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
-                        
-                        list_add_tail(&as->ibas_list, 
-                                      &kibnal_data.kib_connd_acceptq);
+                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+                
+                list_add_tail(&as->ibas_list, &kibnal_data.kib_connd_acceptq);
 
-                        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
-                        wake_up(&kibnal_data.kib_connd_waitq);
+                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+                wake_up(&kibnal_data.kib_connd_waitq);
 
-                        as = NULL;
-                        continue;
-                }
-                
-                if (rc != -EAGAIN) {
-                        CERROR("Accept failed: %d, pausing...\n", rc);
-                        kibnal_pause(HZ);
-                }
+                as = NULL;
         }
 
-        if (as != NULL) {
-                if (as->ibas_sock != NULL)
-                        sock_release(as->ibas_sock);
+        if (as != NULL)
                 PORTAL_FREE(as, sizeof(*as));
-        }
 
         rc = 0;
-        remove_wait_queue(sock->sk->sk_sleep, &wait);
- out_1:
-        sock_release(sock);
+        libcfs_sock_release(sock);
         kibnal_data.kib_listener_sock = NULL;
- out_0:
+ out:
         /* set completion status and unblock thread waiting for me 
          * (parent on startup failure, executioner on normal shutdown) */
         kibnal_data.kib_listener_shutdown = rc;
@@ -782,16 +510,12 @@ kibnal_start_ip_listener (void)
         rc = kibnal_data.kib_listener_shutdown;
         LASSERT ((rc != 0) == (kibnal_data.kib_listener_sock == NULL));
 
-        CDEBUG((rc == 0) ? D_WARNING : D_ERROR, 
-               "Listener %s: pid:%ld port:%d backlog:%d\n", 
-               (rc == 0) ? "started OK" : "startup failed",
-               pid, kibnal_tunables.kib_port, kibnal_tunables.kib_backlog);
-
+        CDEBUG((rc == 0) ? D_NET : D_ERROR, "Listener startup rc: %d\n", rc);
         return rc;
 }
 
 void
-kibnal_stop_ip_listener(int clear_acceptq)
+kibnal_stop_ip_listener(void)
 {
         struct list_head  zombie_accepts;
         kib_acceptsock_t *as;
@@ -803,73 +527,13 @@ kibnal_stop_ip_listener(int clear_acceptq)
         LASSERT (kibnal_data.kib_listener_sock != NULL);
 
         kibnal_data.kib_listener_shutdown = 1;
-        wake_up_all(kibnal_data.kib_listener_sock->sk->sk_sleep);
+        libcfs_sock_abort_accept(kibnal_data.kib_listener_sock);
 
         /* Block until listener has torn down. */
         down(&kibnal_data.kib_listener_signal);
 
         LASSERT (kibnal_data.kib_listener_sock == NULL);
         CDEBUG(D_WARNING, "Listener stopped\n");
-
-        if (!clear_acceptq)
-                return;
-
-        /* Close any unhandled accepts */
-        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
-
-        list_add(&zombie_accepts, &kibnal_data.kib_connd_acceptq);
-        list_del_init(&kibnal_data.kib_connd_acceptq);
-
-        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
-        
-        while (!list_empty(&zombie_accepts)) {
-                as = list_entry(zombie_accepts.next,
-                                kib_acceptsock_t, ibas_list);
-                list_del(&as->ibas_list);
-                kibnal_free_acceptsock(as);
-        }
-}
-
-int 
-kibnal_listener_procint(ctl_table *table, int write, struct file *filp,
-                        void *buffer, size_t *lenp)
-{
-        int   *tunable = (int *)table->data;
-        int    old_val;
-        int    rc;
-
-        /* No race with nal initialisation since the nal is setup all the time
-         * it's loaded.  When that changes, change this! */
-        LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL);
-
-        down(&kibnal_data.kib_nid_mutex);
-
-        LASSERT (tunable == &kibnal_tunables.kib_port ||
-                 tunable == &kibnal_tunables.kib_backlog);
-        old_val = *tunable;
-
-        rc = proc_dointvec(table, write, filp, buffer, lenp);
-
-        if (write &&
-            (*tunable != old_val ||
-             kibnal_data.kib_listener_sock == NULL)) {
-
-                if (kibnal_data.kib_listener_sock != NULL)
-                        kibnal_stop_ip_listener(0);
-
-                rc = kibnal_start_ip_listener();
-                if (rc != 0) {
-                        CERROR("Unable to restart listener with new tunable:"
-                               " reverting to old value\n");
-                        *tunable = old_val;
-                        kibnal_start_ip_listener();
-                }
-        }
-
-        up(&kibnal_data.kib_nid_mutex);
-
-        LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL);
-        return rc;
 }
 
 int
@@ -929,77 +593,16 @@ kibnal_stop_ib_listener (void)
 }
 
 int
-kibnal_set_mynid (ptl_nid_t nid)
-{
-        int               rc;
-
-        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
-               nid, kibnal_data.kib_ni->ni_nid);
-
-        down (&kibnal_data.kib_nid_mutex);
-
-        if (nid == kibnal_data.kib_ni->ni_nid) {
-                /* no change of NID */
-                up (&kibnal_data.kib_nid_mutex);
-                return (0);
-        }
-
-        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
-               kibnal_data.kib_ni->ni_nid, nid);
-
-        if (kibnal_data.kib_listener_sock != NULL)
-                kibnal_stop_ip_listener(1);
-        
-        if (kibnal_data.kib_listen_handle != NULL)
-                kibnal_stop_ib_listener();
-
-        kibnal_data.kib_ni->ni_nid = nid;
-        kibnal_data.kib_incarnation++;
-        mb();
-        /* Delete all existing peers and their connections after new
-         * NID/incarnation set to ensure no old connections in our brave new
-         * world. */
-        kibnal_del_peer (PTL_NID_ANY, 0);
-
-        if (kibnal_data.kib_ni->ni_nid != PTL_NID_ANY) {
-                /* got a new NID to install */
-                rc = kibnal_start_ib_listener();
-                if (rc != 0) {
-                        CERROR("Can't start IB listener: %d\n", rc);
-                        goto failed_0;
-                }
-        
-                rc = kibnal_start_ip_listener();
-                if (rc != 0) {
-                        CERROR("Can't start IP listener: %d\n", rc);
-                        goto failed_1;
-                }
-        }
-        
-        up(&kibnal_data.kib_nid_mutex);
-        return 0;
-
- failed_1:
-        kibnal_stop_ib_listener();
- failed_0:
-        kibnal_data.kib_ni->ni_nid = PTL_NID_ANY;
-        kibnal_data.kib_incarnation++;
-        mb();
-        kibnal_del_peer (PTL_NID_ANY, 0);
-        up(&kibnal_data.kib_nid_mutex);
-        return rc;
-}
-
-kib_peer_t *
-kibnal_create_peer (ptl_nid_t nid)
+kibnal_create_peer (kib_peer_t **peerp, ptl_nid_t nid)
 {
-        kib_peer_t *peer;
+        kib_peer_t    *peer;
+        unsigned long  flags;
 
         LASSERT (nid != PTL_NID_ANY);
 
         PORTAL_ALLOC (peer, sizeof (*peer));
         if (peer == NULL)
-                return (NULL);
+                return -ENOMEM;
 
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
 
@@ -1012,17 +615,33 @@ kibnal_create_peer (ptl_nid_t nid)
         INIT_LIST_HEAD (&peer->ibp_connd_list); /* not queued for connecting */
 
         peer->ibp_reconnect_time = jiffies;
-        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+        peer->ibp_reconnect_interval = *kibnal_tunables.kib_min_reconnect_interval;
 
-        atomic_inc (&kibnal_data.kib_npeers);
-        CDEBUG(D_NET, "peer %p "LPX64"\n", peer, nid);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        
+        if (kibnal_data.kib_listener_shutdown) {
+                /* shutdown has started already */
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+                
+                PORTAL_FREE(peer, sizeof(*peer));
+                CERROR("Can't create peer: network shutdown\n");
+                return -ESHUTDOWN;
+        }
+        
+        kibnal_data.kib_npeers++;
+        
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
-        return (peer);
+        CDEBUG(D_NET, "peer %p "LPX64"\n", peer, nid);
+        *peerp = peer;
+        return 0;
 }
 
 void
 kibnal_destroy_peer (kib_peer_t *peer)
 {
+        unsigned long flags;
+
         CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
 
         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
@@ -1039,7 +658,9 @@ kibnal_destroy_peer (kib_peer_t *peer)
          * they are destroyed, so we can be assured that _all_ state to do
          * with this peer has been cleaned up when its refcount drops to
          * zero. */
-        atomic_dec (&kibnal_data.kib_npeers);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        kibnal_data.kib_npeers--;
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 }
 
 void
@@ -1152,13 +773,14 @@ kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port)
         unsigned long      flags;
         kib_peer_t        *peer;
         kib_peer_t        *peer2;
+        int                rc;
         
         if (nid == PTL_NID_ANY)
                 return (-EINVAL);
 
-        peer = kibnal_create_peer (nid);
-        if (peer == NULL)
-                return (-ENOMEM);
+        rc = kibnal_create_peer (&peer, nid);
+        if (rc != 0)
+                return rc;
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
@@ -1617,10 +1239,15 @@ kibnal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg)
                 break;
         }
         case IOC_PORTAL_REGISTER_MYNID: {
-                if (data->ioc_nid == PTL_NID_ANY)
+                /* Ignore if this is a noop */
+                if (data->ioc_nid == ni->ni_nid) {
+                        rc = 0;
+                } else {
+                        CERROR("obsolete IOC_PORTAL_REGISTER_MYNID: %s(%s)\n",
+                               libcfs_nid2str(data->ioc_nid),
+                               libcfs_nid2str(ni->ni_nid));
                         rc = -EINVAL;
-                else
-                        rc = kibnal_set_mynid (data->ioc_nid);
+                }
                 break;
         }
         }
@@ -1729,14 +1356,14 @@ kibnal_setup_tx_descs (void)
         LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
 
         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
-                                IBNAL_TX_MSG_PAGES, 
+                                IBNAL_TX_MSG_PAGES()
                                 0);            /* local read access only */
         if (rc != 0)
                 return (rc);
 
         vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
 
-        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                 tx = &kibnal_data.kib_tx_descs[i];
 
@@ -1744,7 +1371,7 @@ kibnal_setup_tx_descs (void)
                 
                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                 tx->tx_vaddr = vaddr;
-                tx->tx_isnblk = (i >= IBNAL_NTX);
+                tx->tx_isnblk = (i >= *kibnal_tunables.kib_ntx);
                 tx->tx_mapped = KIB_TX_UNMAPPED;
 
                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
@@ -1758,7 +1385,7 @@ kibnal_setup_tx_descs (void)
                                   &kibnal_data.kib_idle_txs);
 
                 vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES());
 
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
@@ -1766,7 +1393,7 @@ kibnal_setup_tx_descs (void)
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
                 }
         }
         
@@ -1776,8 +1403,9 @@ kibnal_setup_tx_descs (void)
 void
 kibnal_shutdown (ptl_ni_t *ni)
 {
-        int   i;
-        int   rc;
+        int           i;
+        int           rc;
+        unsigned long flags;
 
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
                atomic_read (&portal_kmemory));
@@ -1791,21 +1419,49 @@ kibnal_shutdown (ptl_ni_t *ni)
                 LBUG();
 
         case IBNAL_INIT_ALL:
-                /* resetting my NID unadvertises me, removes my
-                 * listener and nukes all current peers */
-                kibnal_set_mynid (PTL_NID_ANY);
+                /* Stop listeners and prevent new peers from being created */
+                kibnal_stop_ip_listener();
+                /* fall through */
+
+        case IBNAL_INIT_IB:
+                kibnal_stop_ib_listener();
+
+                /* Remove all existing peers from the peer table */
+                kibnal_del_peer(PTL_NID_ANY);
+                
+                /* Wait for pending conn reqs to be handled */
+                i = 2;
+                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+                while (!list_empty(&kibnal_data.kib_connd_acceptq)) {
+                        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, 
+                                               flags);
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
+                               "waiting for conn reqs to clean up\n");
+                        libcfs_pause(cfs_time_seconds(1));
+                        
+                        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+                }
+                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
 
                 /* Wait for all peer state to clean up */
                 i = 2;
-                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+                while (kibnal_data.kib_npeers != 0) {
+                        write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+                                                flags);
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "waiting for %d peers to close down\n",
-                               atomic_read (&kibnal_data.kib_npeers));
-                        set_current_state (TASK_INTERRUPTIBLE);
-                        schedule_timeout (HZ);
+                               kibnal_data.kib_npeers);
+                        libcfs_pause(cfs_time_seconds(1));
+
+                        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                 }
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+                /* fall through */
 
+        case IBNAL_INIT_CQ:
                 rc = ib_cq_destroy (kibnal_data.kib_cq);
                 if (rc != 0)
                         CERROR ("Destroy CQ error: %d\n", rc);
@@ -1830,7 +1486,7 @@ kibnal_shutdown (ptl_ni_t *ni)
         case IBNAL_INIT_DATA:
                 /* Module refcount only gets to zero when all peers
                  * have been closed so all lists must be empty */
-                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (kibnal_data.kib_npeers == 0);
                 LASSERT (kibnal_data.kib_peers != NULL);
                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
@@ -1854,8 +1510,7 @@ kibnal_shutdown (ptl_ni_t *ni)
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "Waiting for %d threads to terminate\n",
                                atomic_read (&kibnal_data.kib_nthreads));
-                        set_current_state (TASK_INTERRUPTIBLE);
-                        schedule_timeout (HZ);
+                        libcfs_pause(cfs_time_seconds(1));
                 }
                 /* fall through */
                 
@@ -1865,7 +1520,7 @@ kibnal_shutdown (ptl_ni_t *ni)
 
         if (kibnal_data.kib_tx_descs != NULL)
                 PORTAL_FREE (kibnal_data.kib_tx_descs,
-                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
+                             IBNAL_TX_MSGS() * sizeof(kib_tx_t));
 
         if (kibnal_data.kib_peers != NULL)
                 PORTAL_FREE (kibnal_data.kib_peers,
@@ -1897,8 +1552,8 @@ kibnal_startup (ptl_ni_t *ni)
                 return PTL_FAIL;
         }
 
-        if (ni->ni_interfaces[0] != NULL) {
-                CERROR("Explicit interface config not supported\n");
+        if (ptl_set_ip_niaddr(ni) != PTL_OK) {
+                CERROR("Can't determine my NID\n");
                 return PTL_FAIL;
         }
         
@@ -1945,7 +1600,7 @@ kibnal_startup (ptl_ni_t *ni)
         init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
 
         PORTAL_ALLOC (kibnal_data.kib_tx_descs,
-                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+                      IBNAL_TX_MSGS() * sizeof(kib_tx_t));
         if (kibnal_data.kib_tx_descs == NULL) {
                 CERROR ("Can't allocate tx descs\n");
                 goto failed;
@@ -1965,7 +1620,7 @@ kibnal_startup (ptl_ni_t *ni)
                 }
         }
 
-        for (i = 0; i < IBNAL_N_CONND; i++) {
+        for (i = 0; i < *kibnal_tunables.kib_n_connd; i++) {
                 rc = kibnal_thread_start (kibnal_connd,
                                           (void *)((unsigned long)i));
                 if (rc != 0) {
@@ -2024,7 +1679,8 @@ kibnal_startup (ptl_ni_t *ni)
         /*****************************************************/
 #if IBNAL_FMR
         {
-                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
+                const int pool_size = *kibnal_tunables.kib_ntx + 
+                                      *kibnal_tunables.kib_ntx_nblk;
                 struct ib_fmr_pool_param params = {
                         .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
                         .access            = (IB_ACCESS_LOCAL_WRITE |
@@ -2069,7 +1725,7 @@ kibnal_startup (ptl_ni_t *ni)
                         },
                         .arg            = NULL,
                 };
-                int  nentries = IBNAL_CQ_ENTRIES;
+                int  nentries = IBNAL_CQ_ENTRIES();
                 
                 rc = ib_cq_create (kibnal_data.kib_device, 
                                    &nentries, &callback, NULL,
@@ -2083,6 +1739,22 @@ kibnal_startup (ptl_ni_t *ni)
                 rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
                 LASSERT (rc == 0);
         }
+
+        /* flag CQ initialised */
+        kibnal_data.kib_init = IBNAL_INIT_CQ;
+        /*****************************************************/
+
+        rc = kibnal_start_ib_listener();
+        if (rc != 0)
+                goto failed;
+        
+        /* flag IB listener initialised */
+        kibnal_data.kib_init = IBNAL_INIT_IB;
+        /*****************************************************/
+
+        rc = kibnal_start_ip_listener();
+        if (rc != 0)
+                goto failed;
         
         /* flag everything initialised */
         kibnal_data.kib_init = IBNAL_INIT_ALL;
@@ -2101,10 +1773,8 @@ kibnal_startup (ptl_ni_t *ni)
 void __exit
 kibnal_module_fini (void)
 {
-        if (kibnal_tunables.kib_sysctl != NULL)
-                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
-
         ptl_unregister_nal(&kibnal_nal);
+        kibnal_tunables_fini();         /* tunables were set up first in init, so they go last */
 }
 
 int __init
@@ -2112,33 +1782,17 @@ kibnal_module_init (void)
 {
         int    rc;
 
-        /* the following must be sizeof(int) for proc_dointvec() */
-        LASSERT (sizeof(kibnal_tunables.kib_io_timeout) == sizeof(int));
-        LASSERT (sizeof(kibnal_tunables.kib_listener_timeout) == sizeof(int));
-        LASSERT (sizeof(kibnal_tunables.kib_backlog) == sizeof(int));
-        LASSERT (sizeof(kibnal_tunables.kib_port) == sizeof(int));
-
-        /* Initialise dynamic tunables to defaults once only */
-        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
-        kibnal_tunables.kib_listener_timeout = IBNAL_LISTENER_TIMEOUT;
-        kibnal_tunables.kib_backlog = IBNAL_BACKLOG;
-        kibnal_tunables.kib_port = IBNAL_PORT;
-
+        rc = kibnal_tunables_init();
+        if (rc != 0)
+                return rc;
+        
         ptl_register_nal(&kibnal_nal);
 
-        kibnal_tunables.kib_sysctl = 
-                register_sysctl_table (kibnal_top_ctl_table, 0);
-        if (kibnal_tunables.kib_sysctl == NULL) {
-                CERROR("Can't register sysctl table\n");
-                ptl_unregister_nal(&kibnal_nal);
-                return (-ENOMEM);
-        }
-
         return (0);
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
+MODULE_DESCRIPTION("Kernel OpenIB NAL v1.00");
 MODULE_LICENSE("GPL");
 
 module_init(kibnal_module_init);
index d922572..7f23702 100644 (file)
 #else
 # define IBNAL_N_SCHED      1                   /* # schedulers */
 #endif
-#define IBNAL_N_CONND       4                   /* # connection daemons */
 
-#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
-#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
-
-#define IBNAL_MSG_SIZE           (4<<10)        /* max size of queued messages (inc hdr) */
-
-#define IBNAL_MSG_QUEUE_SIZE      8             /* # messages/RDMAs in-flight */
-#define IBNAL_CREDIT_HIGHWATER    6             /* when to eagerly return credits */
-#define IBNAL_RETRY               7             /* # times to retry */
-#define IBNAL_RNR_RETRY           7             /*  */
-#define IBNAL_CM_RETRY            7             /* # times to retry connection */
-#define IBNAL_FLOW_CONTROL        1
-#define IBNAL_RESPONDER_RESOURCES 8
-
-#define IBNAL_NTX                 64            /* # tx descs */
-#define IBNAL_NTX_NBLK            256           /* # reserved tx descs */
-
-#define IBNAL_PEER_HASH_SIZE      101           /* # peer lists */
-
-#define IBNAL_RESCHED             100           /* # scheduler loops before reschedule */
+#define IBNAL_FMR                    1
+//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
 
-#define IBNAL_CONCURRENT_PEERS    1000          /* # nodes all talking at once to me */
 
-/* default vals for runtime tunables */
-#define IBNAL_IO_TIMEOUT          50            /* default comms timeout (seconds) */
-#define IBNAL_LISTENER_TIMEOUT    5             /* default listener timeout (seconds) */
-#define IBNAL_BACKLOG             127           /* default listener backlog */
-#define IBNAL_PORT                988           /* default listener port */
+/* defaults for modparams/tunables */
+#define IBNAL_N_CONND                4          /* # connection daemons */
+#define IBNAL_MIN_RECONNECT_INTERVAL 1          /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL 60         /* ...exponentially increasing to this (seconds) */
+#define IBNAL_CONCURRENT_PEERS       1024       /* # nodes all talking at once to me */
+#define IBNAL_CKSUM                  0          /* checksum kib_msg_t? */
+#define IBNAL_TIMEOUT                50         /* default comms timeout (seconds) */
+#define IBNAL_LISTENER_TIMEOUT       5          /* default listener timeout (seconds) */
+#define IBNAL_BACKLOG                127        /* default listener backlog */
+#define IBNAL_PORT                   986        /* default listener port */
+#define IBNAL_NTX                    64         /* # tx descs */
+#define IBNAL_NTX_NBLK               256        /* # reserved tx descs */
+
+/* tunables fixed at compile time */
+#define IBNAL_PEER_HASH_SIZE         101        /* # peer lists */
+#define IBNAL_RESCHED                100        /* # scheduler loops before reschedule */
+#define IBNAL_MSG_QUEUE_SIZE         8          /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER       6          /* when to eagerly return credits */
+#define IBNAL_MSG_SIZE              (4<<10)     /* max size of queued messages (inc hdr) */
+#define IBNAL_RDMA_BASE              0x0eeb0000
+
+/* QP tunables */
+#define IBNAL_RETRY                  7          /* # times to retry */
+#define IBNAL_RNR_RETRY              7          /* value passed as .rnr_retry_count */
+#define IBNAL_CM_RETRY               7          /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL           1          /* value passed as .flow_control */
+#define IBNAL_RESPONDER_RESOURCES    8
 
 /************************/
 /* derived constants... */
 
 /* TX messages (shared by all connections) */
-#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
-#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_TX_MSGS()       (*kibnal_tunables.kib_ntx + \
+                               *kibnal_tunables.kib_ntx_nblk)
+#define IBNAL_TX_MSG_BYTES()  (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES()  ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
 
 /* RX messages (per connection) */
-#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
-#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_RX_MSGS         IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES    (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES    ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
 /* we may have up to 2 completions per transmit +
    1 completion per receive, per connection */
-#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
-                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
-
-#define IBNAL_RDMA_BASE  0x0eeb0000
-#define IBNAL_FMR        1
-#define IBNAL_CKSUM      1
-//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
-#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
+#define IBNAL_CQ_ENTRIES()  ((2*IBNAL_TX_MSGS()) +                                      \
+                             (IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers))
 
 typedef struct
 {
-        int               kib_io_timeout;       /* comms timeout (seconds) */
-        int               kib_listener_timeout; /* listener's timeout */
-        int               kib_backlog;          /* listenter's accept backlog */
-        int               kib_port;             /* where the listener listens */
+        int      *kib_n_connd;                  /* # connection daemons */
+        int      *kib_min_reconnect_interval;   /* min connect retry seconds... */
+        int      *kib_max_reconnect_interval;   /* max connect retry seconds */
+        int      *kib_concurrent_peers;         /* max # peers */
+        int      *kib_cksum;                    /* checksum kib_msg_t? */
+        int      *kib_timeout;                  /* comms timeout (seconds) */
+        int      *kib_listener_timeout;         /* listener's timeout (seconds) */
+        int      *kib_backlog;                  /* listener's accept backlog */
+        int      *kib_port;                     /* where the listener listens */
+        int      *kib_ntx;                      /* # tx descs */
+        int      *kib_ntx_nblk;                 /* # reserved tx descs */
+
         struct ctl_table_header *kib_sysctl;    /* sysctl interface */
 } kib_tunables_t;
 
@@ -162,7 +169,7 @@ typedef struct
 
         struct list_head *kib_peers;            /* hash table of all my known peers */
         int               kib_peer_hash_size;   /* size of kib_peers */
-        atomic_t          kib_npeers;           /* # peers extant */
+        int               kib_npeers;           /* # peers extant */
         atomic_t          kib_nconns;           /* # connections extant */
 
         struct list_head  kib_reaper_conns;     /* connections to reap */
@@ -207,7 +214,9 @@ typedef struct
 #define IBNAL_INIT_PD              3
 #define IBNAL_INIT_FMR             4
 #define IBNAL_INIT_TXD             5
-#define IBNAL_INIT_ALL             6
+#define IBNAL_INIT_CQ              6
+#define IBNAL_INIT_IB              7
+#define IBNAL_INIT_ALL             8
 
 typedef struct kib_acceptsock                   /* accepted socket queued for connd */
 {
@@ -432,7 +441,7 @@ kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
         LASSERT (tx->tx_conn == NULL);          /* only set here */
 
         tx->tx_conn = conn;
-        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+        tx->tx_deadline = jiffies + *kibnal_tunables.kib_timeout * HZ;
         list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
 }
 
@@ -536,8 +545,9 @@ extern void kibnal_free_acceptsock (kib_acceptsock_t *as);
 extern int kibnal_listener_procint(ctl_table *table, int write, 
                                    struct file *filp, void *buffer, 
                                    size_t *lenp);
-extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern int kibnal_create_peer (kib_peer_t **peerp, ptl_nid_t nid);
 extern void kibnal_put_peer (kib_peer_t *peer);
+extern int kibnal_add_persistent_peer(ptl_nid_t nid, __u32 ip, int port);
 extern int kibnal_del_peer (ptl_nid_t nid);
 extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
 extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
@@ -575,3 +585,6 @@ extern void kibnal_start_active_rdma (int type, int status,
                                       unsigned int niov,
                                       struct iovec *iov, ptl_kiov_t *kiov,
                                       int offset, int nob);
+
+extern int  kibnal_tunables_init(void);
+extern void kibnal_tunables_fini(void);
index 6d38da0..7812a03 100644 (file)
@@ -925,6 +925,8 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
         unsigned long    flags;
         kib_peer_t      *peer;
         kib_conn_t      *conn;
+        int              retry;
+        int              rc;
         rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
 
         /* If I get here, I've committed to send, so I complete the tx with
@@ -933,38 +935,50 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
         LASSERT (tx->tx_nsp > 0);               /* work items have been set up */
 
-        read_lock_irqsave(g_lock, flags);
+        for (retry = 0; ; retry = 1) {
+                read_lock_irqsave(g_lock, flags);
         
-        peer = kibnal_find_peer_locked (nid);
-        if (peer == NULL) {
-                read_unlock_irqrestore(g_lock, flags);
-                tx->tx_status = -EHOSTUNREACH;
-                kibnal_tx_done (tx);
-                return;
-        }
-
-        conn = kibnal_find_conn_locked (peer);
-        if (conn != NULL) {
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
-                read_unlock_irqrestore(g_lock, flags);
+                peer = kibnal_find_peer_locked (nid);
+                if (peer != NULL) {
+                        conn = kibnal_find_conn_locked (peer);
+                        if (conn != NULL) {
+                                CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
+                                       conn, conn->ibc_state, libcfs_nid2str(nid),
+                                       atomic_read (&conn->ibc_refcount));
+                                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                                read_unlock_irqrestore(g_lock, flags);
                 
-                kibnal_queue_tx (tx, conn);
-                return;
-        }
-        
-        /* Making one or more connections; I'll need a write lock... */
-        read_unlock(g_lock);
-        write_lock(g_lock);
+                                kibnal_queue_tx (tx, conn);
+                                return;
+                        }
+                }
+                
+                /* Making one or more connections; I'll need a write lock... */
+                read_unlock(g_lock);
+                write_lock(g_lock);
 
-        peer = kibnal_find_peer_locked (nid);
-        if (peer == NULL) {
+                peer = kibnal_find_peer_locked (nid);
+                if (peer != NULL)
+                        break;
+                
                 write_unlock_irqrestore (g_lock, flags);
-                tx->tx_status = -EHOSTUNREACH;
-                kibnal_tx_done (tx);
-                return;
+
+                if (retry) {
+                        CERROR("Can't find per %s\n", libcfs_nid2str(nid));
+                        tx->tx_status = -EHOSTUNREACH;
+                        kibnal_tx_done (tx);
+                        return;
+                }
+
+                rc = kibnal_add_persistent_peer(nid, PTL_NIDADDR(nid),
+                                                *kibnal_tunables.kib_port);
+                if (rc != 0) {
+                        CERROR("Can't add peer %s: %d\n",
+                               libcfs_nid2str(nid), rc);
+                        tx->tx_status = rc;
+                        kibnal_tx_done(tx);
+                        return;
+                }
         }
 
         conn = kibnal_find_conn_locked (peer);
@@ -1519,7 +1533,8 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int rc)
         unsigned long     flags;
 
         LASSERT (rc != 0);
-        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
+        LASSERT (peer->ibp_reconnect_interval >= 
+                 *kibnal_tunables.kib_min_reconnect_interval);
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
@@ -1636,7 +1651,8 @@ kibnal_connreq_done (kib_conn_t *conn, int status)
                 list_add (&conn->ibc_list, &peer->ibp_conns);
                 
                 /* reset reconnect interval for next attempt */
-                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+                peer->ibp_reconnect_interval =
+                        *kibnal_tunables.kib_min_reconnect_interval;
 
                 /* post blocked sends to the new connection */
                 spin_lock (&conn->ibc_lock);
@@ -1733,8 +1749,8 @@ kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
                 return (-ENOMEM);
 
         /* assume 'nid' is a new peer */
-        peer = kibnal_create_peer (msg->ibm_srcnid);
-        if (peer == NULL) {
+        rc = kibnal_create_peer(&peer, msg->ibm_srcnid);
+        if (rc != 0) {
                 CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
                        conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
                        atomic_read (&conn->ibc_refcount));
@@ -2087,7 +2103,7 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
                 .initiator_depth      = IBNAL_RESPONDER_RESOURCES,
                 .retry_count          = IBNAL_RETRY,
                 .rnr_retry_count      = IBNAL_RNR_RETRY,
-                .cm_response_timeout  = kibnal_tunables.kib_io_timeout,
+                .cm_response_timeout  = *kibnal_tunables.kib_timeout,
                 .max_cm_retries       = IBNAL_CM_RETRY,
                 .flow_control         = IBNAL_FLOW_CONTROL,
         };
@@ -2164,7 +2180,7 @@ kibnal_connect_peer (kib_peer_t *peer)
                                     conn->ibc_connreq->cr_svcrsp.ibsr_svc_gid,
                                     conn->ibc_connreq->cr_svcrsp.ibsr_svc_pkey,
                                     0,
-                                    kibnal_tunables.kib_io_timeout * HZ,
+                                    *kibnal_tunables.kib_timeout * HZ,
                                     0,
                                     kibnal_pathreq_callback, conn, 
                                     &conn->ibc_connreq->cr_tid);
@@ -2354,9 +2370,9 @@ kibnal_reaper (void *arg)
                          * connection within (n+1)/n times the timeout
                          * interval. */
 
-                        if (kibnal_tunables.kib_io_timeout > n * p)
+                        if (*kibnal_tunables.kib_timeout > n * p)
                                 chunk = (chunk * n * p) / 
-                                        kibnal_tunables.kib_io_timeout;
+                                        *kibnal_tunables.kib_timeout;
                         if (chunk == 0)
                                 chunk = 1;
 
@@ -2418,8 +2434,7 @@ kibnal_connd (void *arg)
                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                         kibnal_handle_svcqry(as->ibas_sock);
-                        sock_release(as->ibas_sock);
-                        PORTAL_FREE(as, sizeof(*as));
+                        kibnal_free_acceptsock(as);
                         
                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
                         did_something = 1;
diff --git a/lnet/klnds/openiblnd/openiblnd_modparams.c b/lnet/klnds/openiblnd/openiblnd_modparams.c
new file mode 100644 (file)
index 0000000..eb9c130
--- /dev/null
@@ -0,0 +1,149 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "openibnal.h"
+
+static int n_connd = IBNAL_N_CONND;     /* each default below is an IBNAL_* macro from the header */
+CFS_MODULE_PARM(n_connd, "i", int, 0444,
+                "# of connection daemons");
+
+static int min_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
+               "minimum connection retry interval (seconds)");
+
+static int max_reconnect_interval = IBNAL_MAX_RECONNECT_INTERVAL;
+CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
+               "maximum connection retry interval (seconds)");
+
+static int concurrent_peers = IBNAL_CONCURRENT_PEERS;
+CFS_MODULE_PARM(concurrent_peers, "i", int, 0444,
+               "maximum number of peers that may connect");
+
+static int cksum = IBNAL_CKSUM;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+               "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = IBNAL_TIMEOUT;     /* comms timeout; read through kibnal_tunables.kib_timeout */
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "timeout (seconds)");
+
+static int listener_timeout = IBNAL_LISTENER_TIMEOUT;
+CFS_MODULE_PARM(listener_timeout, "i", int, 0644,
+               "passive connection timeout (seconds)");
+
+static int backlog = IBNAL_BACKLOG;
+CFS_MODULE_PARM(backlog, "i", int, 0444,
+               "passive connection (listen) backlog");
+
+static int port = IBNAL_PORT;
+CFS_MODULE_PARM(port, "i", int, 0444,
+               "connection request TCP/IP port");
+
+static int ntx = IBNAL_NTX;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of 'normal' message descriptors");
+
+static int ntx_nblk = IBNAL_NTX_NBLK;
+CFS_MODULE_PARM(ntx_nblk, "i", int, 0444,
+               "# of 'reserved' message descriptors");
+
+kib_tunables_t kibnal_tunables = {      /* every field points at its module parameter */
+        .kib_n_connd                = &n_connd,
+        .kib_min_reconnect_interval = &min_reconnect_interval,
+        .kib_max_reconnect_interval = &max_reconnect_interval,
+        .kib_concurrent_peers       = &concurrent_peers,
+        .kib_cksum                  = &cksum,
+        .kib_timeout                = &timeout,
+        .kib_listener_timeout       = &listener_timeout,
+        .kib_backlog                = &backlog,
+        .kib_port                   = &port,
+        .kib_ntx                    = &ntx,
+        .kib_ntx_nblk               = &ntx_nblk,
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+
+static ctl_table kibnal_ctl_table[] = { /* one entry per module parameter; modes mirror CFS_MODULE_PARM */
+       {1, "n_connd", &n_connd, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {2, "min_reconnect_interval", &min_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {3, "max_reconnect_interval", &max_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {4, "concurrent_peers", &concurrent_peers, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {5, "cksum", &cksum, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {6, "timeout", &timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {7, "listener_timeout", &listener_timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {8, "backlog", &backlog, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {9, "port", &port, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {10, "ntx", &ntx, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {11, "ntx_nblk", &ntx_nblk, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {0}                              /* terminator */
+};
+
+static ctl_table kibnal_top_ctl_table[] = {     /* table handed to register_sysctl_table() */
+       {203, "openibnal", NULL, 0, 0555, kibnal_ctl_table},     /* "openibnal" entry referencing kibnal_ctl_table */
+       {0}
+};
+
+int                                     /* always 0: the /proc tunables are best-effort */
+kibnal_tunables_init (void)
+{
+       kibnal_tunables.kib_sysctl =
+               register_sysctl_table(kibnal_top_ctl_table, 0);
+       
+       if (kibnal_tunables.kib_sysctl == NULL) /* non-fatal: warn and carry on */
+               CWARN("Can't setup /proc tunables\n");
+
+       return 0;
+}
+
+void
+kibnal_tunables_fini (void)
+{
+       if (kibnal_tunables.kib_sysctl != NULL) /* NULL when init could not register the table */
+               unregister_sysctl_table(kibnal_tunables.kib_sysctl);
+}
+
+#else
+
+int
+kibnal_tunables_init (void)             /* stub: no sysctl table is built in this configuration */
+{
+       return 0;
+}
+
+void
+kibnal_tunables_fini (void)             /* stub: nothing was registered, nothing to undo */
+{
+}
+
+#endif