Whamcloud - gitweb
b=5684
[fs/lustre-release.git] / lnet / klnds / socklnd / socklnd_cb.c
index b22d501..33b721d 100644 (file)
@@ -24,9 +24,6 @@
  */
 
 #include "socknal.h"
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-# include <linux/syscalls.h>
-#endif
 
 /*
  *  LIB functions follow
@@ -53,174 +50,77 @@ ksocknal_free_ltx (ksock_ltx_t *ltx)
         PORTAL_FREE(ltx, ltx->ltx_desc_size);
 }
 
-#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
-struct page *
-ksocknal_kvaddr_to_page (unsigned long vaddr)
-{
-        struct page *page;
-
-        if (vaddr >= VMALLOC_START &&
-            vaddr < VMALLOC_END)
-                page = vmalloc_to_page ((void *)vaddr);
-#if CONFIG_HIGHMEM
-        else if (vaddr >= PKMAP_BASE &&
-                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
-                page = vmalloc_to_page ((void *)vaddr);
-                /* in 2.4 ^ just walks the page tables */
-#endif
-        else
-                page = virt_to_page (vaddr);
-
-        if (page == NULL ||
-            !VALID_PAGE (page))
-                return (NULL);
-
-        return (page);
-}
-#endif
-
 int
 ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
-{
-        struct socket *sock = conn->ksnc_sock;
+{ 
         struct iovec  *iov = tx->tx_iov;
-        int            fragsize = iov->iov_len;
-        unsigned long  vaddr = (unsigned long)iov->iov_base;
-        int            more = (tx->tx_niov > 1) || 
-                              (tx->tx_nkiov > 0) ||
-                              (!list_empty (&conn->ksnc_tx_queue));
-#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
-        int            offset = vaddr & (PAGE_SIZE - 1);
-        int            zcsize = MIN (fragsize, PAGE_SIZE - offset);
-        struct page   *page;
-#endif
-        int            rc;
+        int    nob;
+        int    rc;
 
-        /* NB we can't trust socket ops to either consume our iovs
-         * or leave them alone, so we only send 1 frag at a time. */
-        LASSERT (fragsize <= tx->tx_resid);
         LASSERT (tx->tx_niov > 0);
-        
-#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
-        if (zcsize >= ksocknal_data.ksnd_zc_min_frag &&
-            (sock->sk->route_caps & NETIF_F_SG) &&
-            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
-            (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) {
-                
-                CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
-                       (void *)vaddr, page, page_address(page), offset, zcsize);
 
-                if (fragsize > zcsize) {
-                        more = 1;
-                        fragsize = zcsize;
-                }
+        /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */
+        rc = ksocknal_lib_send_iov(conn, tx);
 
-                rc = tcp_sendpage_zccd(sock, page, offset, zcsize, 
-                                       more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT,
-                                       &tx->tx_zccd);
-        } else
-#endif
-        {
-                /* NB don't pass tx's iov; sendmsg may or may not update it */
-                struct iovec fragiov = { .iov_base = (void *)vaddr,
-                                         .iov_len  = fragsize};
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &fragiov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT
-                };
-                mm_segment_t oldmm = get_fs();
-
-                set_fs (KERNEL_DS);
-                rc = sock_sendmsg(sock, &msg, fragsize);
-                set_fs (oldmm);
-        } 
+        if (rc <= 0)                            /* sent nothing? */ 
+                return (rc);
 
-        if (rc > 0) {
-                tx->tx_resid -= rc;
+        nob = rc; 
+        LASSERT (nob <= tx->tx_resid); 
+        tx->tx_resid -= nob;
+
+        /* "consume" iov */ 
+        do { 
+                LASSERT (tx->tx_niov > 0); 
+
+                if (nob < iov->iov_len) { 
+                        iov->iov_base = (void *)(((unsigned long)(iov->iov_base)) + nob); 
+                        iov->iov_len -= nob; 
+                        return (rc); 
+                } 
+
+                nob -= iov->iov_len; 
+                tx->tx_iov = ++iov; 
+                tx->tx_niov--; 
+        } while (nob != 0);
 
-                if (rc < iov->iov_len) {
-                        /* didn't send whole iov entry... */
-                        iov->iov_base = (void *)(vaddr + rc);
-                        iov->iov_len -= rc;
-                } else {
-                        tx->tx_iov++;
-                        tx->tx_niov--;
-                }
-        }
-        
         return (rc);
 }
 
 int
 ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
-{
-        struct socket *sock = conn->ksnc_sock;
+{ 
         ptl_kiov_t    *kiov = tx->tx_kiov;
-        int            fragsize = kiov->kiov_len;
-        struct page   *page = kiov->kiov_page;
-        int            offset = kiov->kiov_offset;
-        int            more = (tx->tx_nkiov > 1) ||
-                              (!list_empty (&conn->ksnc_tx_queue));
-        int            rc;
+        int     nob;
+        int     rc;
 
-        /* NB we can't trust socket ops to either consume our iovs
-         * or leave them alone, so we only send 1 frag at a time. */
-        LASSERT (fragsize <= tx->tx_resid);
-        LASSERT (offset + fragsize <= PAGE_SIZE);
-        LASSERT (tx->tx_niov == 0);
+        LASSERT (tx->tx_niov == 0); 
         LASSERT (tx->tx_nkiov > 0);
 
-#if SOCKNAL_ZC
-        if (fragsize >= ksocknal_tunables.ksnd_zc_min_frag &&
-            (sock->sk->route_caps & NETIF_F_SG) &&
-            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) {
+        /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */
+        rc = ksocknal_lib_send_kiov(conn, tx);
 
-                CDEBUG(D_NET, "page %p + offset %x for %d\n",
-                               page, offset, fragsize);
+        if (rc <= 0)                            /* sent nothing? */ 
+                return (rc); 
+        
+        nob = rc; 
+        LASSERT (nob <= tx->tx_resid); 
+        tx->tx_resid -= nob; 
 
-                rc = tcp_sendpage_zccd(sock, page, offset, fragsize,
-                                       more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT,
-                                       &tx->tx_zccd);
-        } else
-#endif
-        {
-                char *addr = ((char *)kmap (page)) + offset;
-                struct iovec fragiov = {.iov_base = addr,
-                                        .iov_len  = fragsize};
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &fragiov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT
-                };
-                mm_segment_t  oldmm = get_fs();
-                
-                set_fs (KERNEL_DS);
-                rc = sock_sendmsg(sock, &msg, fragsize);
-                set_fs (oldmm);
+        /* "consume" kiov */ 
+        do { 
+                LASSERT(tx->tx_nkiov > 0); 
 
-                kunmap (page);
-        }
+                if (nob < kiov->kiov_len) { 
+                        kiov->kiov_offset += nob; 
+                        kiov->kiov_len -= nob; 
+                        return rc; 
+                } 
 
-        if (rc > 0) {
-                tx->tx_resid -= rc;
-                if (rc < fragsize) {
-                        kiov->kiov_offset = offset + rc;
-                        kiov->kiov_len    = fragsize - rc;
-                } else {
-                        tx->tx_kiov++;
-                        tx->tx_nkiov--;
-                }
-        }
+                nob -= kiov->kiov_len; 
+                tx->tx_kiov = ++kiov; 
+                tx->tx_nkiov--; 
+        } while (nob != 0);
 
         return (rc);
 }
@@ -233,7 +133,7 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
         
         if (ksocknal_data.ksnd_stall_tx != 0) {
                 set_current_state (TASK_UNINTERRUPTIBLE);
-                schedule_timeout (ksocknal_data.ksnd_stall_tx * HZ);
+                schedule_timeout (cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
         }
 
         LASSERT (tx->tx_resid != 0);
@@ -255,49 +155,48 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                         rc = ksocknal_send_kiov (conn, tx);
                 }
 
-                bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
+                bufnob = SOCK_WMEM_QUEUED(conn->ksnc_sock);
                 if (rc > 0)                     /* sent something? */
                         conn->ksnc_tx_bufnob += rc; /* account it */
                 
                 if (bufnob < conn->ksnc_tx_bufnob) {
                         /* allocated send buffer bytes < computed; infer
                          * something got ACKed */
-                        conn->ksnc_tx_deadline = jiffies + 
-                                                 ksocknal_tunables.ksnd_io_timeout * HZ;
-                        conn->ksnc_peer->ksnp_last_alive = jiffies;
+                        conn->ksnc_tx_deadline = cfs_time_shift(ksocknal_tunables.ksnd_io_timeout);
+                        conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
                         conn->ksnc_tx_bufnob = bufnob;
                         mb();
                 }
 
-                if (rc <= 0) {
-                        /* Didn't write anything.
-                         *
-                         * NB: rc == 0 and rc == -EAGAIN both mean try
-                         * again later (linux stack returns -EAGAIN for
-                         * this, but Adaptech TOE returns 0).
-                         *
-                         * Also, sends never fail with -ENOMEM, just
-                         * -EAGAIN, but with the added bonus that we can't
-                         * expect write_space() to call us back to tell us
-                         * when to try sending again.  We use the
-                         * SOCK_NOSPACE flag to diagnose...  */
-
-                        LASSERT(rc != -ENOMEM);
-
-                        if (rc == 0 || rc == -EAGAIN) {
-                                if (test_bit(SOCK_NOSPACE, 
-                                             &conn->ksnc_sock->flags)) {
-                                        rc = -EAGAIN;
-                                } else {
-                                        static int counter;
-                         
-                                        counter++;
-                                        if ((counter & (-counter)) == counter)
-                                                CWARN("%d ENOMEM tx %p\n", 
-                                                      counter, conn);
-                                        rc = -ENOMEM;
-                                }
+                if (rc <= 0) { /* Didn't write anything? */
+                        unsigned long  flags;
+                        ksock_sched_t *sched;
+
+                        if (rc == 0) /* some stacks return 0 instead of -EAGAIN */
+                                rc = -EAGAIN;
+
+                        if (rc != -EAGAIN)
+                                break;
+
+                        /* Check if EAGAIN is due to memory pressure */
+
+                        sched = conn->ksnc_scheduler;
+                        spin_lock_irqsave(&sched->kss_lock, flags);
+                                
+                        if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) &&
+                            !conn->ksnc_tx_ready) {
+                                /* SOCK_NOSPACE is set when the socket fills
+                                 * and cleared in the write_space callback
+                                 * (which also sets ksnc_tx_ready).  If
+                                 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
+                                 * zero, I didn't fill the socket and
+                                 * write_space won't reschedule me, so I
+                                 * return -ENOMEM to get my caller to retry
+                                 * after a timeout */
+                                rc = -ENOMEM;
                         }
+
+                        spin_unlock_irqrestore(&sched->kss_lock, flags);
                         break;
                 }
 
@@ -311,135 +210,91 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
         return (rc);
 }
 
-void
-ksocknal_eager_ack (ksock_conn_t *conn)
-{
-        int            opt = 1;
-        mm_segment_t   oldmm = get_fs();
-        struct socket *sock = conn->ksnc_sock;
-        
-        /* Remind the socket to ACK eagerly.  If I don't, the socket might
-         * think I'm about to send something it could piggy-back the ACK
-         * on, introducing delay in completing zero-copy sends in my
-         * peer. */
-
-        set_fs(KERNEL_DS);
-        sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
-                               (char *)&opt, sizeof (opt));
-        set_fs(oldmm);
-}
-
 int
 ksocknal_recv_iov (ksock_conn_t *conn)
-{
+{ 
         struct iovec *iov = conn->ksnc_rx_iov;
-        int           fragsize  = iov->iov_len;
-        unsigned long vaddr = (unsigned long)iov->iov_base;
-        struct iovec  fragiov = { .iov_base = (void *)vaddr,
-                                  .iov_len  = fragsize};
-        struct msghdr msg = {
-                .msg_name       = NULL,
-                .msg_namelen    = 0,
-                .msg_iov        = &fragiov,
-                .msg_iovlen     = 1,
-                .msg_control    = NULL,
-                .msg_controllen = 0,
-                .msg_flags      = 0
-        };
-        mm_segment_t oldmm = get_fs();
-        int          rc;
-
-        /* NB we can't trust socket ops to either consume our iovs
-         * or leave them alone, so we only receive 1 frag at a time. */
+        int     nob;
+        int     rc;
+
         LASSERT (conn->ksnc_rx_niov > 0);
-        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
 
-        set_fs (KERNEL_DS);
-        rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
-        /* NB this is just a boolean............................^ */
-        set_fs (oldmm);
+        /* Never touch conn->ksnc_rx_iov or change connection 
+         * status inside ksocknal_lib_recv_iov */
+        rc = ksocknal_lib_recv_iov(conn); 
 
-        if (rc <= 0)
-                return (rc);
+        if (rc <= 0) 
+                return (rc); 
 
-        /* received something... */
-        conn->ksnc_peer->ksnp_last_alive = jiffies;
-        conn->ksnc_rx_deadline = jiffies + 
-                                 ksocknal_tunables.ksnd_io_timeout * HZ;
-        mb();                           /* order with setting rx_started */
-        conn->ksnc_rx_started = 1;
+        /* received something... */ 
+        nob = rc; 
+        
+        conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); 
+        conn->ksnc_rx_deadline = cfs_time_shift (ksocknal_tunables.ksnd_io_timeout); 
+        mb();                           /* order with setting rx_started */ 
+        conn->ksnc_rx_started = 1; 
+        
+        conn->ksnc_rx_nob_wanted -= nob; 
+        conn->ksnc_rx_nob_left -= nob;
 
-        conn->ksnc_rx_nob_wanted -= rc;
-        conn->ksnc_rx_nob_left -= rc;
-                
-        if (rc < fragsize) {
-                iov->iov_base = (void *)(vaddr + rc);
-                iov->iov_len = fragsize - rc;
-                return (-EAGAIN);
-        }
+        do { 
+                LASSERT (conn->ksnc_rx_niov > 0); 
 
-        conn->ksnc_rx_iov++;
-        conn->ksnc_rx_niov--;
-        return (1);
+                if (nob < iov->iov_len) { 
+                        iov->iov_len -= nob; 
+                        iov->iov_base = (void *)(((unsigned long)iov->iov_base) + nob); 
+                        return (-EAGAIN); 
+                } 
+
+                nob -= iov->iov_len; 
+                conn->ksnc_rx_iov = ++iov; 
+                conn->ksnc_rx_niov--; 
+        } while (nob != 0);
+
+        return (rc);
 }
 
 int
 ksocknal_recv_kiov (ksock_conn_t *conn)
 {
         ptl_kiov_t   *kiov = conn->ksnc_rx_kiov;
-        struct page  *page = kiov->kiov_page;
-        int           offset = kiov->kiov_offset;
-        int           fragsize = kiov->kiov_len;
-        unsigned long vaddr = ((unsigned long)kmap (page)) + offset;
-        struct iovec  fragiov = { .iov_base = (void *)vaddr,
-                                  .iov_len  = fragsize};
-        struct msghdr msg = {
-                .msg_name       = NULL,
-                .msg_namelen    = 0,
-                .msg_iov        = &fragiov,
-                .msg_iovlen     = 1,
-                .msg_control    = NULL,
-                .msg_controllen = 0,
-                .msg_flags      = 0
-        };
-        mm_segment_t oldmm = get_fs();
-        int          rc;
-
-        /* NB we can't trust socket ops to either consume our iovs
-         * or leave them alone, so we only receive 1 frag at a time. */
-        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
+        int     nob;
+        int     rc;
         LASSERT (conn->ksnc_rx_nkiov > 0);
-        LASSERT (offset + fragsize <= PAGE_SIZE);
 
-        set_fs (KERNEL_DS);
-        rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
-        /* NB this is just a boolean............................^ */
-        set_fs (oldmm);
-
-        kunmap (page);
+        /* Never touch conn->ksnc_rx_kiov or change connection 
+         * status inside ksocknal_lib_recv_kiov */
+        rc = ksocknal_lib_recv_kiov(conn); 
         
-        if (rc <= 0)
-                return (rc);
+        if (rc <= 0) 
+                return (rc); 
         
-        /* received something... */
-        conn->ksnc_peer->ksnp_last_alive = jiffies;
-        conn->ksnc_rx_deadline = jiffies + 
-                                 ksocknal_tunables.ksnd_io_timeout * HZ;
-        mb();                           /* order with setting rx_started */
+        /* received something... */ 
+        nob = rc; 
+
+        conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); 
+        conn->ksnc_rx_deadline = cfs_time_shift (ksocknal_tunables.ksnd_io_timeout); 
+        mb();                           /* order with setting rx_started */ 
         conn->ksnc_rx_started = 1;
 
-        conn->ksnc_rx_nob_wanted -= rc;
-        conn->ksnc_rx_nob_left -= rc;
-                
-        if (rc < fragsize) {
-                kiov->kiov_offset = offset + rc;
-                kiov->kiov_len = fragsize - rc;
-                return (-EAGAIN);
-        }
+        conn->ksnc_rx_nob_wanted -= nob; 
+        conn->ksnc_rx_nob_left -= nob; 
+        
+        do { 
+                LASSERT (conn->ksnc_rx_nkiov > 0); 
+
+                if (nob < kiov->kiov_len) { 
+                        kiov->kiov_offset += nob; 
+                        kiov->kiov_len -= nob; 
+                        return -EAGAIN; 
+                } 
 
-        conn->ksnc_rx_kiov++;
-        conn->ksnc_rx_nkiov--;
-        return (1);
+                nob -= kiov->kiov_len; 
+                conn->ksnc_rx_kiov = ++kiov; 
+                conn->ksnc_rx_nkiov--; 
+        } while (nob != 0);
+
+        return 1;
 }
 
 int
@@ -453,7 +308,7 @@ ksocknal_receive (ksock_conn_t *conn)
         
         if (ksocknal_data.ksnd_stall_rx != 0) {
                 set_current_state (TASK_UNINTERRUPTIBLE);
-                schedule_timeout (ksocknal_data.ksnd_stall_rx * HZ);
+                schedule_timeout(cfs_time_seconds (ksocknal_data.ksnd_stall_rx));
         }
 
         rc = ksocknal_getconnsock (conn);
@@ -487,7 +342,7 @@ ksocknal_receive (ksock_conn_t *conn)
                             (conn->ksnc_rx_state ==  SOCKNAL_RX_BODY ||
                              conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) {
                                 /* Remind the socket to ack eagerly... */
-                                ksocknal_eager_ack(conn);
+                                ksocknal_lib_eager_ack(conn);
                         }
                         rc = 1;
                         break;
@@ -512,7 +367,7 @@ ksocknal_zc_callback (zccd_t *zcd)
         spin_lock_irqsave (&sched->kss_lock, flags);
 
         list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list);
-        wake_up (&sched->kss_waitq);
+        cfs_waitq_signal (&sched->kss_waitq);
 
         spin_unlock_irqrestore (&sched->kss_lock, flags);
         EXIT;
@@ -599,6 +454,12 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                 return (rc);
 
         if (rc == -ENOMEM) {
+                static int counter;
+
+                counter++;   /* exponential backoff warnings */
+                if ((counter & (-counter)) == counter)
+                        CWARN("%d ENOMEM tx %p\n", counter, conn);
+
                 /* Queue on ksnd_enomem_conns for retry after a timeout */
                 spin_lock_irqsave(&ksocknal_data.ksnd_reaper_lock, flags);
 
@@ -606,9 +467,10 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                 LASSERT (conn->ksnc_tx_scheduled);
                 list_add_tail(&conn->ksnc_tx_list,
                               &ksocknal_data.ksnd_enomem_conns);
-                if (!time_after_eq(jiffies + SOCKNAL_ENOMEM_RETRY,
+                if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(),
+                                                   SOCKNAL_ENOMEM_RETRY),
                                    ksocknal_data.ksnd_reaper_waketime))
-                        wake_up (&ksocknal_data.ksnd_reaper_waitq);
+                        cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq);
                 
                 spin_unlock_irqrestore(&ksocknal_data.ksnd_reaper_lock, flags);
                 return (rc);
@@ -617,12 +479,26 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
         /* Actual error */
         LASSERT (rc < 0);
 
-        if (!conn->ksnc_closing)
+        if (!conn->ksnc_closing) {
+                switch (rc) {
+                case -ECONNRESET:
+                        LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection "
+                                      "while we were sending data; it may have "
+                                      "rebooted.\n",
+                                      HIPQUAD(conn->ksnc_ipaddr));
+                        break;
+                default:
+                        LCONSOLE_WARN("There was an unexpected network error "
+                                      "while writing to %u.%u.%u.%u: %d.\n",
+                                      HIPQUAD(conn->ksnc_ipaddr), rc);
+                        break;
+                }
                 CERROR("[%p] Error %d on write to "LPX64
                        " ip %d.%d.%d.%d:%d\n", conn, rc,
                        conn->ksnc_peer->ksnp_nid,
                        HIPQUAD(conn->ksnc_ipaddr),
                        conn->ksnc_port);
+        }
 
         ksocknal_close_conn_and_siblings (conn, rc);
         ksocknal_tx_launched (tx);
@@ -654,7 +530,7 @@ ksocknal_launch_autoconnect_locked (ksock_route_t *route)
         
         list_add_tail (&route->ksnr_connect_list,
                        &ksocknal_data.ksnd_autoconnectd_routes);
-        wake_up (&ksocknal_data.ksnd_autoconnectd_waitq);
+        cfs_waitq_signal (&ksocknal_data.ksnd_autoconnectd_waitq);
         
         spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags);
 }
@@ -710,7 +586,7 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer)
                 const int     nob = 0;
 #else
                 int           nob = atomic_read(&c->ksnc_tx_nob) +
-                                        c->ksnc_sock->sk->sk_wmem_queued;
+                                        SOCK_WMEM_QUEUED(c->ksnc_sock);
 #endif
                 LASSERT (!c->ksnc_closing);
 
@@ -787,10 +663,9 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
         spin_lock_irqsave (&sched->kss_lock, flags);
 
         if (list_empty(&conn->ksnc_tx_queue) &&
-            conn->ksnc_sock->sk->sk_wmem_queued == 0) {
+            SOCK_WMEM_QUEUED(conn->ksnc_sock) == 0) {
                 /* First packet starts the timeout */
-                conn->ksnc_tx_deadline = jiffies +
-                                         ksocknal_tunables.ksnd_io_timeout * HZ;
+                conn->ksnc_tx_deadline = cfs_time_shift(ksocknal_tunables.ksnd_io_timeout);
                 conn->ksnc_tx_bufnob = 0;
                 mb();    /* order with adding to tx_queue */
         }
@@ -804,7 +679,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
                 list_add_tail (&conn->ksnc_tx_list, 
                                &sched->kss_tx_conns);
                 conn->ksnc_tx_scheduled = 1;
-                wake_up (&sched->kss_waitq);
+                cfs_waitq_signal (&sched->kss_waitq);
         }
 
         spin_unlock_irqrestore (&sched->kss_lock, flags);
@@ -834,7 +709,7 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
                         continue;
 
                 /* too soon to retry this guy? */
-                if (!time_after_eq (jiffies, route->ksnr_timeout))
+                if (!cfs_time_aftereq (cfs_time_current(), route->ksnr_timeout))
                         continue;
                 
                 return (route);
@@ -1103,7 +978,7 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
 int
 ksocknal_thread_start (int (*fn)(void *arg), void *arg)
 {
-        long          pid = kernel_thread (fn, arg, 0);
+        long          pid = cfs_kernel_thread (fn, arg, 0);
         unsigned long flags;
 
         if (pid < 0)
@@ -1179,7 +1054,7 @@ ksocknal_fmb_callback (void *arg, int error)
         spin_lock_irqsave (&sched->kss_lock, flags);
 
         list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns);
-        wake_up (&sched->kss_waitq);
+        cfs_waitq_signal (&sched->kss_waitq);
 
         spin_unlock_irqrestore (&sched->kss_lock, flags);
 }
@@ -1195,7 +1070,7 @@ ksocknal_get_idle_fmb (ksock_conn_t *conn)
         LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
         LASSERT (kpr_routing(&ksocknal_data.ksnd_router));
 
-        if (payload_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE)
+        if (payload_nob <= SOCKNAL_SMALL_FWD_PAGES * CFS_PAGE_SIZE)
                 pool = &ksocknal_data.ksnd_small_fmp;
         else
                 pool = &ksocknal_data.ksnd_large_fmp;
@@ -1235,8 +1110,8 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
         LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
         LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left);
         LASSERT (payload_nob >= 0);
-        LASSERT (payload_nob <= fmb->fmb_pool->fmp_buff_pages * PAGE_SIZE);
-        LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE);
+        LASSERT (payload_nob <= fmb->fmb_pool->fmp_buff_pages * CFS_PAGE_SIZE);
+        LASSERT (sizeof (ptl_hdr_t) < CFS_PAGE_SIZE);
         LASSERT (fmb->fmb_kiov[0].kiov_offset == 0);
 
         /* Take a ref on the conn's peer to prevent module unload before
@@ -1253,8 +1128,8 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
         while (nob > 0) {
                 LASSERT (niov < fmb->fmb_pool->fmp_buff_pages);
                 LASSERT (fmb->fmb_kiov[niov].kiov_offset == 0);
-                fmb->fmb_kiov[niov].kiov_len = MIN (PAGE_SIZE, nob);
-                nob -= PAGE_SIZE;
+                fmb->fmb_kiov[niov].kiov_len = MIN (CFS_PAGE_SIZE, nob);
+                nob -= CFS_PAGE_SIZE;
                 niov++;
         }
 
@@ -1783,177 +1658,78 @@ int ksocknal_scheduler (void *arg)
         return (0);
 }
 
-void
-ksocknal_data_ready (struct sock *sk, int n)
+/*
+ * Mark conn ready to read; if it is not already scheduled, add it to
+ * kss_rx_conns of its scheduler and wake up the scheduler.
+ */
+void ksocknal_read_callback (ksock_conn_t *conn)
 {
+        ksock_sched_t *sched; 
         unsigned long  flags;
-        ksock_conn_t  *conn;
-        ksock_sched_t *sched;
         ENTRY;
 
-        /* interleave correctly with closing sockets... */
-        read_lock (&ksocknal_data.ksnd_global_lock);
+        sched = conn->ksnc_scheduler; 
 
-        conn = sk->sk_user_data;
-        if (conn == NULL) {             /* raced with ksocknal_terminate_conn */
-                LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
-                sk->sk_data_ready (sk, n);
-        } else {
-                sched = conn->ksnc_scheduler;
-
-                spin_lock_irqsave (&sched->kss_lock, flags);
+        spin_lock_irqsave (&sched->kss_lock, flags); 
 
-                conn->ksnc_rx_ready = 1;
+        conn->ksnc_rx_ready = 1; 
 
-                if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
-                        list_add_tail(&conn->ksnc_rx_list,
-                                      &sched->kss_rx_conns);
-                        conn->ksnc_rx_scheduled = 1;
-                        /* extra ref for scheduler */
-                        atomic_inc (&conn->ksnc_refcount);
+        if (!conn->ksnc_rx_scheduled) {  /* not being progressed */ 
+                list_add_tail(&conn->ksnc_rx_list, 
+                              &sched->kss_rx_conns); 
+                conn->ksnc_rx_scheduled = 1; 
+                /* extra ref for scheduler */ 
+                atomic_inc (&conn->ksnc_refcount); 
 
-                        wake_up (&sched->kss_waitq);
-                }
-
-                spin_unlock_irqrestore (&sched->kss_lock, flags);
-        }
-
-        read_unlock (&ksocknal_data.ksnd_global_lock);
+                cfs_waitq_signal (&sched->kss_waitq); 
+        } 
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
 
         EXIT;
-}
+} 
 
-void
-ksocknal_write_space (struct sock *sk)
-{
+/*
+ * Mark conn ready to write; if it has queued packets and is not already
+ * scheduled, add it to kss_tx_conns and wake up the scheduler.
+ */
+void ksocknal_write_callback (ksock_conn_t *conn)
+{ 
+        ksock_sched_t *sched; 
         unsigned long  flags;
-        ksock_conn_t  *conn;
-        ksock_sched_t *sched;
-
-        /* interleave correctly with closing sockets... */
-        read_lock (&ksocknal_data.ksnd_global_lock);
-
-        conn = sk->sk_user_data;
-
-        CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
-               sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn,
-               (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
-                                      " ready" : " blocked"),
-               (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
-                                      " scheduled" : " idle"),
-               (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
-                                      " empty" : " queued"));
-
-        if (conn == NULL) {             /* raced with ksocknal_terminate_conn */
-                LASSERT (sk->sk_write_space != &ksocknal_write_space);
-                sk->sk_write_space (sk);
-
-                read_unlock (&ksocknal_data.ksnd_global_lock);
-                return;
-        }
-
-        if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */
-                clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
-
-                sched = conn->ksnc_scheduler;
+        ENTRY;
+        
+        sched = conn->ksnc_scheduler; 
 
-                spin_lock_irqsave (&sched->kss_lock, flags);
+        spin_lock_irqsave (&sched->kss_lock, flags); 
 
-                conn->ksnc_tx_ready = 1;
+        conn->ksnc_tx_ready = 1; 
 
-                if (!conn->ksnc_tx_scheduled && // not being progressed
-                    !list_empty(&conn->ksnc_tx_queue)){//packets to send
-                        list_add_tail (&conn->ksnc_tx_list,
-                                       &sched->kss_tx_conns);
-                        conn->ksnc_tx_scheduled = 1;
-                        /* extra ref for scheduler */
-                        atomic_inc (&conn->ksnc_refcount);
+        if (!conn->ksnc_tx_scheduled && // not being progressed 
+            !list_empty(&conn->ksnc_tx_queue)){//packets to send 
+                list_add_tail (&conn->ksnc_tx_list, 
+                               &sched->kss_tx_conns); 
+                conn->ksnc_tx_scheduled = 1; 
+                /* extra ref for scheduler */ 
+                atomic_inc (&conn->ksnc_refcount); 
 
-                        wake_up (&sched->kss_waitq);
-                }
+                cfs_waitq_signal (&sched->kss_waitq); 
+        } 
 
-                spin_unlock_irqrestore (&sched->kss_lock, flags);
-        }
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
 
-        read_unlock (&ksocknal_data.ksnd_global_lock);
+        EXIT;
 }
 
 int
 ksocknal_sock_write (struct socket *sock, void *buffer, int nob)
 {
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
-
-        while (nob > 0) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
-
-                set_fs (KERNEL_DS);
-                rc = sock_sendmsg (sock, &msg, iov.iov_len);
-                set_fs (oldmm);
-                
-                if (rc < 0)
-                        return (rc);
-
-                if (rc == 0) {
-                        CERROR ("Unexpected zero rc\n");
-                        return (-ECONNABORTED);
-                }
-
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
-        }
-        
-        return (0);
+        return ksocknal_lib_sock_write(sock, buffer, nob);
 }
 
 int
 ksocknal_sock_read (struct socket *sock, void *buffer, int nob)
 {
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
-        
-        while (nob > 0) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
-
-                set_fs (KERNEL_DS);
-                rc = sock_recvmsg (sock, &msg, iov.iov_len, 0);
-                set_fs (oldmm);
-                
-                if (rc < 0)
-                        return (rc);
-
-                if (rc == 0)
-                        return (-ECONNABORTED);
-
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
-        }
-        
-        return (0);
+        return ksocknal_lib_sock_read(sock, buffer, nob);
 }
 
 int
@@ -2036,6 +1812,7 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
         int                 type;
         ptl_hdr_t           hdr;
         ptl_magicversion_t *hmv;
+        char                ipbuf[PTL_NALFMT_SIZE];
 
         hmv = (ptl_magicversion_t *)&hdr.dest_nid;
         LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
@@ -2098,6 +1875,16 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
         if (*nid == PTL_NID_ANY) {              /* don't know peer's nid yet */
                 *nid = le64_to_cpu(hdr.src_nid);
         } else if (*nid != le64_to_cpu (hdr.src_nid)) {
+                LCONSOLE_ERROR("Connected successfully to nid "LPX64" on host "
+                               "%u.%u.%u.%u, but they claimed they were nid "
+                               LPX64" (%s); please check your Lustre "
+                               "configuration.\n",
+                               *nid, HIPQUAD(conn->ksnc_ipaddr),
+                               le64_to_cpu(hdr.src_nid),
+                               portals_nid2str(SOCKNAL,
+                                               le64_to_cpu(hdr.src_nid),
+                                               ipbuf));
+                               
                 CERROR ("Connected to nid "LPX64"@%u.%u.%u.%u "
                         "but expecting "LPX64"\n",
                         le64_to_cpu (hdr.src_nid),
@@ -2159,282 +1946,50 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
 int
 ksocknal_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
 {
-        mm_segment_t   oldmm = get_fs ();
-        struct socket *sock = conn->ksnc_sock;
-        int            len;
-        int            rc;
-
-        rc = ksocknal_getconnsock (conn);
-        if (rc != 0) {
-                LASSERT (conn->ksnc_closing);
-                *txmem = *rxmem = *nagle = 0;
-                return (-ESHUTDOWN);
-        }
-        
-        set_fs (KERNEL_DS);
-
-        len = sizeof(*txmem);
-        rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-                             (char *)txmem, &len);
-        if (rc == 0) {
-                len = sizeof(*rxmem);
-                rc = sock_getsockopt(sock, SOL_SOCKET, SO_RCVBUF,
-                                     (char *)rxmem, &len);
-        }
-        if (rc == 0) {
-                len = sizeof(*nagle);
-                rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
-                                           (char *)nagle, &len);
-        }
-
-        set_fs (oldmm);
-        ksocknal_putconnsock (conn);
-
-        if (rc == 0)
-                *nagle = !*nagle;
-        else
-                *txmem = *rxmem = *nagle = 0;
-                
-        return (rc);
-}
-
-int
-ksocknal_setup_sock (struct socket *sock)
-{
-        mm_segment_t    oldmm = get_fs ();
-        int             rc;
-        int             option;
-        int             keep_idle;
-        int             keep_intvl;
-        int             keep_count;
-        int             do_keepalive;
-        struct linger   linger;
-
-        sock->sk->sk_allocation = GFP_NOFS;
-
-        /* Ensure this socket aborts active sends immediately when we close
-         * it. */
-
-        linger.l_onoff = 0;
-        linger.l_linger = 0;
-
-        set_fs (KERNEL_DS);
-        rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER,
-                              (char *)&linger, sizeof (linger));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set SO_LINGER: %d\n", rc);
-                return (rc);
-        }
-
-        option = -1;
-        set_fs (KERNEL_DS);
-        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2,
-                                    (char *)&option, sizeof (option));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set SO_LINGER2: %d\n", rc);
-                return (rc);
-        }
-
-        if (!ksocknal_tunables.ksnd_nagle) {
-                option = 1;
-                
-                set_fs (KERNEL_DS);
-                rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY,
-                                            (char *)&option, sizeof (option));
-                set_fs (oldmm);
-                if (rc != 0) {
-                        CERROR ("Can't disable nagle: %d\n", rc);
-                        return (rc);
-                }
-        }
-        
-        if (ksocknal_tunables.ksnd_buffer_size > 0) {
-                option = ksocknal_tunables.ksnd_buffer_size;
-                
-                set_fs (KERNEL_DS);
-                rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF,
-                                      (char *)&option, sizeof (option));
-                set_fs (oldmm);
-                if (rc != 0) {
-                        CERROR ("Can't set send buffer %d: %d\n",
-                                option, rc);
-                        return (rc);
-                }
-
-                set_fs (KERNEL_DS);
-                rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF,
-                                      (char *)&option, sizeof (option));
-                set_fs (oldmm);
-                if (rc != 0) {
-                        CERROR ("Can't set receive buffer %d: %d\n",
-                                option, rc);
-                        return (rc);
-                }
-        }
-
-        /* snapshot tunables */
-        keep_idle  = ksocknal_tunables.ksnd_keepalive_idle;
-        keep_count = ksocknal_tunables.ksnd_keepalive_count;
-        keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl;
-        
-        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
-
-        option = (do_keepalive ? 1 : 0);
-        set_fs (KERNEL_DS);
-        rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, 
-                              (char *)&option, sizeof (option));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
-                return (rc);
-        }
-
-        if (!do_keepalive)
-                return (0);
-
-        set_fs (KERNEL_DS);
-        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
-                                    (char *)&keep_idle, sizeof (keep_idle));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
-                return (rc);
-        }
-
-        set_fs (KERNEL_DS);
-        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
-                                    (char *)&keep_intvl, sizeof (keep_intvl));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
-                return (rc);
-        }
-
-        set_fs (KERNEL_DS);
-        rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
-                                    (char *)&keep_count, sizeof (keep_count));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
-                return (rc);
-        }
-
-        return (0);
+        return ksocknal_lib_get_conn_tunables(conn, txmem, rxmem, nagle);
 }
 
 int
 ksocknal_connect_peer (ksock_route_t *route, int type)
 {
-        struct sockaddr_in  ipaddr;
-        mm_segment_t        oldmm = get_fs();
-        struct timeval      tv;
-        int                 fd;
         struct socket      *sock;
         int                 rc;
+        int                 port;
+        int                 may_retry;
         
-        rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
-        if (rc != 0) {
-                CERROR ("Can't create autoconnect socket: %d\n", rc);
-                return (rc);
-        }
-
-        /* Ugh; have to map_fd for compatibility with sockets passed in
-         * from userspace.  And we actually need the sock->file refcounting
-         * that this gives you :) */
-
-        fd = sock_map_fd (sock);
-        if (fd < 0) {
-                sock_release (sock);
-                CERROR ("sock_map_fd error %d\n", fd);
-                return (fd);
-        }
+        /* Iterate through reserved ports.  When typed connections are
+         * used, we will need to bind to multiple ports, but we only know
+         * this at connect time.  But, by that time we've already called
+         * bind() so we need a new socket. */
 
-        /* NB the fd now owns the ref on sock->file */
-        LASSERT (sock->file != NULL);
-        LASSERT (file_count(sock->file) == 1);
+        for (port = 1023; port > 512; --port) {
 
-        /* Set the socket timeouts, so our connection attempt completes in
-         * finite time */
-        tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
-        tv.tv_usec = 0;
+                rc = ksocknal_lib_connect_sock(&sock, &may_retry, route, port);
 
-        set_fs (KERNEL_DS);
-        rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO,
-                              (char *)&tv, sizeof (tv));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set send timeout %d: %d\n", 
-                        ksocknal_tunables.ksnd_io_timeout, rc);
-                goto out;
-        }
-        
-        set_fs (KERNEL_DS);
-        rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO,
-                              (char *)&tv, sizeof (tv));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set receive timeout %d: %d\n",
-                        ksocknal_tunables.ksnd_io_timeout, rc);
-                goto out;
-        }
-
-        if (route->ksnr_myipaddr != 0) {
-                /* Bind to the local IP address */
-                memset (&ipaddr, 0, sizeof (ipaddr));
-                ipaddr.sin_family = AF_INET;
-                ipaddr.sin_port = htons (0); /* ANY */
-                ipaddr.sin_addr.s_addr = htonl(route->ksnr_myipaddr);
-
-                rc = sock->ops->bind (sock, (struct sockaddr *)&ipaddr,
-                                      sizeof (ipaddr));
-                if (rc != 0) {
-                        CERROR ("Can't bind to local IP %u.%u.%u.%u: %d\n",
-                                HIPQUAD(route->ksnr_myipaddr), rc);
-                        goto out;
+                if (rc == 0) {
+                        rc = ksocknal_create_conn(route, sock, type);
+                        cfs_put_file(KSN_SOCK2FILE(sock));
+                        return rc;
                 }
-        }
-        
-        memset (&ipaddr, 0, sizeof (ipaddr));
-        ipaddr.sin_family = AF_INET;
-        ipaddr.sin_port = htons (route->ksnr_port);
-        ipaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
-        
-        rc = sock->ops->connect (sock, (struct sockaddr *)&ipaddr, 
-                                 sizeof (ipaddr), sock->file->f_flags);
-        if (rc != 0) {
-                CERROR ("Can't connect to nid "LPX64
-                        " local IP: %u.%u.%u.%u,"
-                        " remote IP: %u.%u.%u.%u/%d: %d\n", 
-                        route->ksnr_peer->ksnp_nid,
-                        HIPQUAD(route->ksnr_myipaddr),
-                        HIPQUAD(route->ksnr_ipaddr),
-                        route->ksnr_port, rc);
-                goto out;
-        }
 
-        rc = ksocknal_create_conn (route, sock, type);
-        if (rc == 0) {
-                /* Take an extra ref on sock->file to compensate for the
-                 * upcoming close which will lose fd's ref on it. */
-                get_file (sock->file);
+                if (!may_retry)
+                        return rc;
         }
 
- out:
-        sys_close (fd);
-        return (rc);
+        CERROR("Out of ports trying to bind to a reserved port\n");
+        return (-EADDRINUSE);
 }
 
 void
 ksocknal_autoconnect (ksock_route_t *route)
 {
-        LIST_HEAD        (zombies);
+        CFS_LIST_HEAD    (zombies);
         ksock_tx_t       *tx;
         ksock_peer_t     *peer;
         unsigned long     flags;
         int               rc;
         int               type;
+        char *err_msg = NULL;
         
         for (;;) {
                 for (type = 0; type < SOCKNAL_CONN_NTYPES; type++)
@@ -2443,7 +1998,6 @@ ksocknal_autoconnect (ksock_route_t *route)
                 LASSERT (type < SOCKNAL_CONN_NTYPES);
 
                 rc = ksocknal_connect_peer (route, type);
-
                 if (rc != 0)
                         break;
                 
@@ -2456,6 +2010,52 @@ ksocknal_autoconnect (ksock_route_t *route)
                 }
         }
 
+        switch (rc) {
+        /* "normal" errors */
+        case -ECONNREFUSED:
+                LCONSOLE_ERROR("Connection was refused by host %u.%u.%u.%u on "
+                               "port %d; check that Lustre is running on that "
+                               "node.\n",
+                               HIPQUAD(route->ksnr_ipaddr),
+                               route->ksnr_port);
+                break;
+        case -EHOSTUNREACH:
+        case -ENETUNREACH:
+                LCONSOLE_ERROR("Host %u.%u.%u.%u was unreachable; the network "
+                               "or that node may be down, or Lustre may be "
+                               "misconfigured.\n",
+                               HIPQUAD(route->ksnr_ipaddr));
+                break;
+        case -ETIMEDOUT:
+                LCONSOLE_ERROR("Connecting to host %u.%u.%u.%u on port %d took "
+                               "too long; that node may be hung or "
+                               "experiencing high load.\n",
+                               HIPQUAD(route->ksnr_ipaddr),
+                               route->ksnr_port);
+                break;
+        /* errors that should be rare */
+        case -EPROTO:
+                err_msg = "Portals could not negotiate a connection";
+                break;
+        case -EAGAIN:
+        case -EADDRINUSE:
+                /* -EAGAIN is out of ports, but we specify the ports
+                 * manually.  We really should never get this */
+                err_msg = "no privileged ports were available";
+                break;
+        default:
+                err_msg = "unknown error";
+                break;
+        }
+
+        if (err_msg) {
+                LCONSOLE_ERROR("There was an unexpected error connecting to host "
+                               "%u.%u.%u.%u on port %d: %s (error code %d).\n",
+                               HIPQUAD(route->ksnr_ipaddr),
+                               route->ksnr_port,
+                               err_msg, -rc);
+        }
+
         /* Connection attempt failed */
 
         write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
@@ -2465,7 +2065,8 @@ ksocknal_autoconnect (ksock_route_t *route)
 
         /* This is a retry rather than a new connection */
         LASSERT (route->ksnr_retry_interval != 0);
-        route->ksnr_timeout = jiffies + route->ksnr_retry_interval;
+        route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+                                           route->ksnr_retry_interval);
         route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2,
                                           SOCKNAL_MAX_RECONNECT_INTERVAL);
 
@@ -2576,34 +2177,67 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer)
                 /* Don't need the {get,put}connsock dance to deref ksnc_sock... */
                 LASSERT (!conn->ksnc_closing);
 
-                if (conn->ksnc_sock->sk->sk_err != 0) {
-                        /* Something (e.g. failed keepalive) set the socket error */
+                if (SOCK_ERROR(conn->ksnc_sock) != 0) {
                         atomic_inc (&conn->ksnc_refcount);
+
+                        switch (SOCK_ERROR(conn->ksnc_sock)) {
+                        case ECONNRESET:
+                                LCONSOLE_WARN("A connection with %u.%u.%u.%u "
+                                              "was reset; they may have "
+                                              "rebooted.\n",
+                                              HIPQUAD(conn->ksnc_ipaddr));
+                                break;
+                        case ETIMEDOUT:
+                                LCONSOLE_WARN("A connection with %u.%u.%u.%u "
+                                              "timed out; the network or that "
+                                              "node may be down.\n",
+                                              HIPQUAD(conn->ksnc_ipaddr));
+                                break;
+                        default:
+                                LCONSOLE_WARN("An unexpected network error "
+                                              "occurred with %u.%u.%u.%u: %d.\n",
+                                              HIPQUAD(conn->ksnc_ipaddr),
+                                              SOCK_ERROR(conn->ksnc_sock));
+                                break;
+                        }
+
+                        /* Something (e.g. failed keepalive) set the socket error */
                         CERROR ("Socket error %d: "LPX64" %p %d.%d.%d.%d\n",
-                                conn->ksnc_sock->sk->sk_err, peer->ksnp_nid,
+                                SOCK_ERROR(conn->ksnc_sock), peer->ksnp_nid,
                                 conn, HIPQUAD(conn->ksnc_ipaddr));
+
                         return (conn);
                 }
 
                 if (conn->ksnc_rx_started &&
-                    time_after_eq (jiffies, conn->ksnc_rx_deadline)) {
+                    cfs_time_aftereq (cfs_time_current(), 
+                                      conn->ksnc_rx_deadline)) {
                         /* Timed out incomplete incoming message */
                         atomic_inc (&conn->ksnc_refcount);
+                        LCONSOLE_ERROR("A timeout occurred receiving data from "
+                                       "%u.%u.%u.%u; the network or that node "
+                                       "may be down.\n",
+                                       HIPQUAD(conn->ksnc_ipaddr));
                         CERROR ("Timed out RX from "LPX64" %p %d.%d.%d.%d\n",
                                 peer->ksnp_nid,conn,HIPQUAD(conn->ksnc_ipaddr));
                         return (conn);
                 }
 
                 if ((!list_empty (&conn->ksnc_tx_queue) ||
-                     conn->ksnc_sock->sk->sk_wmem_queued != 0) &&
-                    time_after_eq (jiffies, conn->ksnc_tx_deadline)) {
+                     SOCK_WMEM_QUEUED(conn->ksnc_sock) != 0) &&
+                    cfs_time_aftereq (cfs_time_current(), 
+                                      conn->ksnc_tx_deadline)) {
                         /* Timed out messages queued for sending or
                          * buffered in the socket's send buffer */
                         atomic_inc (&conn->ksnc_refcount);
+                        LCONSOLE_ERROR("A timeout occurred sending data to "
+                                       "%u.%u.%u.%u; the network or that node "
+                                       "may be down.\n",
+                                       HIPQUAD(conn->ksnc_ipaddr));
                         CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n",
                                 peer->ksnp_nid,
                                 list_empty (&conn->ksnc_tx_queue) ? "" : "Q ",
-                                conn->ksnc_sock->sk->sk_wmem_queued, conn,
+                                SOCK_WMEM_QUEUED(conn->ksnc_sock), conn,
                                 HIPQUAD(conn->ksnc_ipaddr));
                         return (conn);
                 }
@@ -2653,22 +2287,22 @@ ksocknal_check_peer_timeouts (int idx)
 int
 ksocknal_reaper (void *arg)
 {
-        wait_queue_t       wait;
+        cfs_waitlink_t     wait;
         unsigned long      flags;
         ksock_conn_t      *conn;
         ksock_sched_t     *sched;
         struct list_head   enomem_conns;
         int                nenomem_conns;
-        int                timeout;
+        cfs_duration_t     timeout;
         int                i;
         int                peer_index = 0;
-        unsigned long      deadline = jiffies;
+        cfs_time_t         deadline = cfs_time_current();
         
         kportal_daemonize ("ksocknal_reaper");
         kportal_blockallsigs ();
 
-        INIT_LIST_HEAD(&enomem_conns);
-        init_waitqueue_entry (&wait, current);
+        CFS_INIT_LIST_HEAD(&enomem_conns);
+        cfs_waitlink_init (&wait);
 
         spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
 
@@ -2722,14 +2356,15 @@ ksocknal_reaper (void *arg)
                         LASSERT (conn->ksnc_tx_scheduled);
                         conn->ksnc_tx_ready = 1;
                         list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
-                        wake_up (&sched->kss_waitq);
+                        cfs_waitq_signal (&sched->kss_waitq);
 
                         spin_unlock_irqrestore (&sched->kss_lock, flags);
                         nenomem_conns++;
                 }
                 
                 /* careful with the jiffy wrap... */
-                while ((timeout = (int)(deadline - jiffies)) <= 0) {
+                while ((timeout = cfs_time_sub(deadline, 
+                                               cfs_time_current())) <= 0) {
                         const int n = 4;
                         const int p = 1;
                         int       chunk = ksocknal_data.ksnd_peer_hash_size;
@@ -2753,7 +2388,7 @@ ksocknal_reaper (void *arg)
                                              ksocknal_data.ksnd_peer_hash_size;
                         }
 
-                        deadline += p * HZ;
+                        deadline = cfs_time_add(deadline, cfs_time_seconds(p));
                 }
 
                 if (nenomem_conns != 0) {
@@ -2762,18 +2397,19 @@ ksocknal_reaper (void *arg)
                          * if any go back on my enomem list. */
                         timeout = SOCKNAL_ENOMEM_RETRY;
                 }
-                ksocknal_data.ksnd_reaper_waketime = jiffies + timeout;
+                ksocknal_data.ksnd_reaper_waketime = 
+                        cfs_time_add(cfs_time_current(), timeout);
 
                 set_current_state (TASK_INTERRUPTIBLE);
-                add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+                cfs_waitq_add (&ksocknal_data.ksnd_reaper_waitq, &wait);
 
                 if (!ksocknal_data.ksnd_shuttingdown &&
                     list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
                     list_empty (&ksocknal_data.ksnd_zombie_conns))
-                        schedule_timeout (timeout);
+                        cfs_waitq_timedwait (&wait, timeout);
 
                 set_current_state (TASK_RUNNING);
-                remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+                cfs_waitq_del (&ksocknal_data.ksnd_reaper_waitq, &wait);
 
                 spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
         }