Whamcloud - gitweb
land b1_4_bgl on HEAD (20050404_1913)
[fs/lustre-release.git] / lnet / klnds / qswlnd / qswlnd.c
index b5e1e39..be01f5d 100644 (file)
@@ -1,11 +1,8 @@
 /*
- * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Copyright (C) 2002-2004 Cluster File Systems, Inc.
  *   Author: Eric Barton <eric@bartonsoftware.com>
  *
- * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
- * W. Marcus Miller - Based on ksocknal
- *
- * This file is part of Portals, http://www.sf.net/projects/lustre/
+ * This file is part of Portals, http://www.lustre.org
  *
  * Portals is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
 
 #include "qswnal.h"
 
-ptl_handle_ni_t                kqswnal_ni;
 nal_t                  kqswnal_api;
 kqswnal_data_t         kqswnal_data;
+ptl_handle_ni_t         kqswnal_ni;
+kqswnal_tunables_t      kqswnal_tunables;
 
 kpr_nal_interface_t kqswnal_router_interface = {
        kprni_nalid:    QSWNAL,
        kprni_arg:      NULL,
        kprni_fwd:      kqswnal_fwd_packet,
+       kprni_notify:   NULL,                   /* no notify callback: we're connectionless */
 };
 
+#if CONFIG_SYSCTL
+#define QSWNAL_SYSCTL  201
 
-static int
-kqswnal_forward(nal_t   *nal,
-               int     id,
-               void    *args,  size_t args_len,
-               void    *ret,   size_t ret_len)
-{
-       kqswnal_data_t *k = nal->nal_data;
-       nal_cb_t       *nal_cb = k->kqn_cb;
-
-       LASSERT (nal == &kqswnal_api);
-       LASSERT (k == &kqswnal_data);
-       LASSERT (nal_cb == &kqswnal_lib);
-
-       lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
-       return (PTL_OK);
-}
-
-static void
-kqswnal_lock (nal_t *nal, unsigned long *flags)
-{
-       kqswnal_data_t *k = nal->nal_data;
-       nal_cb_t       *nal_cb = k->kqn_cb;
+#define QSWNAL_SYSCTL_OPTIMIZED_GETS     1
+#define QSWNAL_SYSCTL_OPTIMIZED_PUTS     2
 
-       LASSERT (nal == &kqswnal_api);
-       LASSERT (k == &kqswnal_data);
-       LASSERT (nal_cb == &kqswnal_lib);
-
-       nal_cb->cb_cli(nal_cb,flags);
-}
-
-static void
-kqswnal_unlock(nal_t *nal, unsigned long *flags)
-{
-       kqswnal_data_t *k = nal->nal_data;
-       nal_cb_t       *nal_cb = k->kqn_cb;
-
-       LASSERT (nal == &kqswnal_api);
-       LASSERT (k == &kqswnal_data);
-       LASSERT (nal_cb == &kqswnal_lib);
-
-       nal_cb->cb_sti(nal_cb,flags);
-}
-
-static int
-kqswnal_shutdown(nal_t *nal, int ni)
-{
-       CDEBUG (D_NET, "shutdown\n");
-
-       LASSERT (nal == &kqswnal_api);
-       return (0);
-}
-
-static void
-kqswnal_yield( nal_t *nal )
-{
-       CDEBUG (D_NET, "yield\n");
-
-       if (current->need_resched)
-               schedule();
-       return;
-}
-
-static nal_t *
-kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
-            ptl_pid_t requested_pid)
-{
-       ptl_nid_t mynid = kqswnal_elanid2nid (kqswnal_data.kqn_elanid);
-       int       nnids = kqswnal_data.kqn_nnodes;
-
-        CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", mynid, nnids);
-
-       lib_init(&kqswnal_lib, mynid, 0, nnids, ptl_size, ac_size);
+static ctl_table kqswnal_ctl_table[] = {       /* tunables referenced from the "qswnal" entry of kqswnal_top_ctl_table */
+       {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts",        /* entry named "optimized_puts"... */
+        &kqswnal_tunables.kqn_optimized_puts, sizeof (int),    /* ...pointing at this int-sized tunable */
+        0644, NULL, &proc_dointvec},                           /* handled by proc_dointvec; 0644 is presumably the file mode */
+       {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets",        /* entry named "optimized_gets"... */
+        &kqswnal_tunables.kqn_optimized_gets, sizeof (int),    /* ...pointing at this int-sized tunable */
+        0644, NULL, &proc_dointvec},                           /* handled by proc_dointvec; 0644 is presumably the file mode */
+       {0}                                                     /* all-zero entry; presumably the table terminator -- confirm */
+};
 
-       return (&kqswnal_api);
-}
+static ctl_table kqswnal_top_ctl_table[] = {   /* single "qswnal" entry that refers to kqswnal_ctl_table */
+       {QSWNAL_SYSCTL, "qswnal", NULL, 0, 0555, kqswnal_ctl_table},    /* NULL/0: presumably no data of its own; last field presumably the child table -- confirm */
+       {0}                                                             /* all-zero entry; presumably the table terminator -- confirm */
+};
+#endif
 
 int
-kqswnal_get_tx_desc (struct portal_ioctl_data *data)
+kqswnal_get_tx_desc (struct portals_cfg *pcfg)
 {
        unsigned long      flags;
        struct list_head  *tmp;
        kqswnal_tx_t      *ktx;
-       int                index = data->ioc_count;
+       ptl_hdr_t         *hdr;
+       int                index = pcfg->pcfg_count;
        int                rc = -ENOENT;
 
        spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
@@ -125,19 +70,20 @@ kqswnal_get_tx_desc (struct portal_ioctl_data *data)
        list_for_each (tmp, &kqswnal_data.kqn_activetxds) {
                if (index-- != 0)
                        continue;
-               
-               ktx = list_entry (tmp, kqswnal_tx_t, ktx_list);
-
-               data->ioc_pbuf1 = (char *)ktx;
-               data->ioc_count = NTOH__u32(ktx->ktx_wire_hdr->type);
-               data->ioc_size  = NTOH__u32(PTL_HDR_LENGTH(ktx->ktx_wire_hdr));
-               data->ioc_nid   = NTOH__u64(ktx->ktx_wire_hdr->dest_nid);
-               data->ioc_nid2  = ktx->ktx_nid;
-               data->ioc_misc  = ktx->ktx_launcher;
-               data->ioc_flags = (list_empty (&ktx->ktx_delayed_list) ? 0 : 1) |
-                                 ((!ktx->ktx_forwarding)              ? 0 : 2) |
-                                 ((!ktx->ktx_isnblk)                  ? 0 : 4);
 
+               ktx = list_entry (tmp, kqswnal_tx_t, ktx_list);
+               hdr = (ptl_hdr_t *)ktx->ktx_buffer;
+
+               memcpy(pcfg->pcfg_pbuf, ktx,
+                      MIN(sizeof(*ktx), pcfg->pcfg_plen1));
+               pcfg->pcfg_count = le32_to_cpu(hdr->type);
+               pcfg->pcfg_size  = le32_to_cpu(hdr->payload_length);
+               pcfg->pcfg_nid   = le64_to_cpu(hdr->dest_nid);
+               pcfg->pcfg_nid2  = ktx->ktx_nid;
+               pcfg->pcfg_misc  = ktx->ktx_launcher;
+               pcfg->pcfg_flags = (list_empty (&ktx->ktx_delayed_list) ? 0 : 1) |
+                                 (!ktx->ktx_isnblk                    ? 0 : 2) |
+                                 (ktx->ktx_state << 2);
                rc = 0;
                break;
        }
@@ -147,21 +93,21 @@ kqswnal_get_tx_desc (struct portal_ioctl_data *data)
 }
 
 int
-kqswnal_cmd (struct portal_ioctl_data *data, void *private)
+kqswnal_cmd (struct portals_cfg *pcfg, void *private)
 {
-       LASSERT (data != NULL);
+       LASSERT (pcfg != NULL);
        
-       switch (data->ioc_nal_cmd) {
+       switch (pcfg->pcfg_command) {
        case NAL_CMD_GET_TXDESC:
-               return (kqswnal_get_tx_desc (data));
+               return (kqswnal_get_tx_desc (pcfg));
 
        case NAL_CMD_REGISTER_MYNID:
                CDEBUG (D_IOCTL, "setting NID offset to "LPX64" (was "LPX64")\n",
-                       data->ioc_nid - kqswnal_data.kqn_elanid,
+                       pcfg->pcfg_nid - kqswnal_data.kqn_elanid,
                        kqswnal_data.kqn_nid_offset);
                kqswnal_data.kqn_nid_offset =
-                       data->ioc_nid - kqswnal_data.kqn_elanid;
-               kqswnal_lib.ni.nid = data->ioc_nid;
+                       pcfg->pcfg_nid - kqswnal_data.kqn_elanid;
+               kqswnal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid;
                return (0);
                
        default:
@@ -169,21 +115,34 @@ kqswnal_cmd (struct portal_ioctl_data *data, void *private)
        }
 }
 
-void __exit
-kqswnal_finalise (void)
+static void
+kqswnal_shutdown(nal_t *nal)
 {
+       unsigned long flags;
+       kqswnal_tx_t *ktx;
+       kqswnal_rx_t *krx;
+       int           do_lib_fini = 0;
+
+       /* NB The first ref was this module! */
+       if (nal->nal_refct != 0) {
+               PORTAL_MODULE_UNUSE;
+               return;
+       }
+
+       CDEBUG (D_NET, "shutdown\n");
+       LASSERT (nal == &kqswnal_api);
+
        switch (kqswnal_data.kqn_init)
        {
        default:
                LASSERT (0);
 
        case KQN_INIT_ALL:
-               PORTAL_SYMBOL_UNREGISTER (kqswnal_ni);
+                libcfs_nal_cmd_unregister(QSWNAL);
                /* fall through */
 
-       case KQN_INIT_PTL:
-               PtlNIFini (kqswnal_ni);
-               lib_fini (&kqswnal_lib);
+       case KQN_INIT_LIB:
+               do_lib_fini = 1;
                /* fall through */
 
        case KQN_INIT_DATA:
@@ -194,41 +153,93 @@ kqswnal_finalise (void)
        }
 
        /**********************************************************************/
-       /* Make router stop her calling me and fail any more call-ins */
+       /* Tell router we're shutting down.  Any router calls my threads
+        * make will now fail immediately and the router will stop calling
+        * into me. */
        kpr_shutdown (&kqswnal_data.kqn_router);
-
+       
        /**********************************************************************/
-       /* flag threads to terminate, wake them and wait for them to die */
-
+       /* Signal the start of shutdown... */
+       spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
        kqswnal_data.kqn_shuttingdown = 1;
-       wake_up_all (&kqswnal_data.kqn_sched_waitq);
+       spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
 
-       while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
-               CDEBUG(D_NET, "waiting for %d threads to terminate\n",
-                      atomic_read (&kqswnal_data.kqn_nthreads));
+       wake_up_all(&kqswnal_data.kqn_idletxd_waitq);
+
+       /**********************************************************************/
+       /* wait for sends that have allocated a tx desc to launch or give up */
+       while (atomic_read (&kqswnal_data.kqn_pending_txs) != 0) {
+               CDEBUG(D_NET, "waiting for %d pending sends\n",
+                      atomic_read (&kqswnal_data.kqn_pending_txs));
                set_current_state (TASK_UNINTERRUPTIBLE);
                schedule_timeout (HZ);
        }
 
        /**********************************************************************/
        /* close elan comms */
+#if MULTIRAIL_EKC
+       /* Shut down receivers first; rx callbacks might try sending... */
+       if (kqswnal_data.kqn_eprx_small != NULL)
+               ep_free_rcvr (kqswnal_data.kqn_eprx_small);
+
+       if (kqswnal_data.kqn_eprx_large != NULL)
+               ep_free_rcvr (kqswnal_data.kqn_eprx_large);
 
+       /* NB ep_free_rcvr() returns only after we've freed off all receive
+        * buffers (see shutdown handling in kqswnal_requeue_rx()).  This
+        * means we must have completed any messages we passed to
+        * lib_parse() or kpr_fwd_start(). */
+
+       if (kqswnal_data.kqn_eptx != NULL)
+               ep_free_xmtr (kqswnal_data.kqn_eptx);
+
+       /* NB ep_free_xmtr() returns only after all outstanding transmits
+        * have called their callback... */
+       LASSERT(list_empty(&kqswnal_data.kqn_activetxds));
+#else
+       /* "Old" EKC just pretends to shutdown cleanly but actually
+        * provides no guarantees */
        if (kqswnal_data.kqn_eprx_small != NULL)
                ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small);
 
        if (kqswnal_data.kqn_eprx_large != NULL)
                ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large);
 
+       /* wait for transmits to complete */
+       while (!list_empty(&kqswnal_data.kqn_activetxds)) {
+               CWARN("waiting for active transmits to complete\n");
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               schedule_timeout(HZ);
+       }
+
        if (kqswnal_data.kqn_eptx != NULL)
                ep_free_large_xmtr (kqswnal_data.kqn_eptx);
+#endif
+       /**********************************************************************/
+       /* flag threads to terminate, wake them and wait for them to die */
+       kqswnal_data.kqn_shuttingdown = 2;
+       wake_up_all (&kqswnal_data.kqn_sched_waitq);
+
+       while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
+               CDEBUG(D_NET, "waiting for %d threads to terminate\n",
+                      atomic_read (&kqswnal_data.kqn_nthreads));
+               set_current_state (TASK_UNINTERRUPTIBLE);
+               schedule_timeout (HZ);
+       }
 
        /**********************************************************************/
        /* No more threads.  No more portals, router or comms callbacks!
         * I control the horizontals and the verticals...
         */
 
+#if MULTIRAIL_EKC
+       LASSERT (list_empty (&kqswnal_data.kqn_readyrxds));
+       LASSERT (list_empty (&kqswnal_data.kqn_delayedtxds));
+       LASSERT (list_empty (&kqswnal_data.kqn_delayedfwds));
+#endif
+
        /**********************************************************************/
-       /* Complete any blocked forwarding packets with error
+       /* Complete any blocked forwarding packets, with error
         */
 
        while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq))
@@ -236,85 +247,97 @@ kqswnal_finalise (void)
                kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
                                                  kpr_fwd_desc_t, kprfd_list);
                list_del (&fwd->kprfd_list);
-               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
-       }
-
-       while (!list_empty (&kqswnal_data.kqn_delayedfwds))
-       {
-               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next,
-                                                 kpr_fwd_desc_t, kprfd_list);
-               list_del (&fwd->kprfd_list);
-               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -ESHUTDOWN);
        }
 
        /**********************************************************************/
-       /* Wait for router to complete any packets I sent her
-        */
+       /* finalise router and portals lib */
 
        kpr_deregister (&kqswnal_data.kqn_router);
 
+       if (do_lib_fini)
+               lib_fini (&kqswnal_lib);
 
        /**********************************************************************/
        /* Unmap message buffers and free all descriptors and buffers
         */
 
+#if MULTIRAIL_EKC
+       /* FTTB, we need to unmap any remaining mapped memory.  When
+        * ep_dvma_release() gets fixed (and releases any mappings in the
+        * region), we can delete all the code from here -------->  */
+
+       for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) {
+               /* If ktx has a buffer, it got mapped; unmap now.  NB only
+                * the pre-mapped stuff is still mapped since all tx descs
+                * must be idle */
+
+               if (ktx->ktx_buffer != NULL)
+                       ep_dvma_unload(kqswnal_data.kqn_ep,
+                                      kqswnal_data.kqn_ep_tx_nmh,
+                                      &ktx->ktx_ebuffer);
+       }
+
+       for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
+               /* If krx_kiov[0].kiov_page got allocated, it got mapped.  
+                * NB subsequent pages get merged */
+
+               if (krx->krx_kiov[0].kiov_page != NULL)
+                       ep_dvma_unload(kqswnal_data.kqn_ep,
+                                      kqswnal_data.kqn_ep_rx_nmh,
+                                      &krx->krx_elanbuffer);
+       }
+       /* <----------- to here */
+
+       if (kqswnal_data.kqn_ep_rx_nmh != NULL)
+               ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh);
+
+       if (kqswnal_data.kqn_ep_tx_nmh != NULL)
+               ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh);
+#else
        if (kqswnal_data.kqn_eprxdmahandle != NULL)
        {
-               elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+               elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState,
                                  kqswnal_data.kqn_eprxdmahandle, 0,
                                  KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
                                  KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE);
 
-               elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+               elan3_dma_release(kqswnal_data.kqn_ep->DmaState,
                                  kqswnal_data.kqn_eprxdmahandle);
        }
 
        if (kqswnal_data.kqn_eptxdmahandle != NULL)
        {
-               elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+               elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState,
                                  kqswnal_data.kqn_eptxdmahandle, 0,
                                  KQSW_NTXMSGPAGES * (KQSW_NTXMSGS +
                                                      KQSW_NNBLK_TXMSGS));
 
-               elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+               elan3_dma_release(kqswnal_data.kqn_ep->DmaState,
                                  kqswnal_data.kqn_eptxdmahandle);
        }
+#endif
 
-       if (kqswnal_data.kqn_txds != NULL)
-       {
-               int   i;
-
-               for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
-               {
-                       kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+       while (kqswnal_data.kqn_txds != NULL) {
+               ktx = kqswnal_data.kqn_txds;
 
-                       if (ktx->ktx_buffer != NULL)
-                               PORTAL_FREE(ktx->ktx_buffer,
-                                           KQSW_TX_BUFFER_SIZE);
-               }
+               if (ktx->ktx_buffer != NULL)
+                       PORTAL_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
 
-               PORTAL_FREE(kqswnal_data.kqn_txds,
-                           sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
-                                                    KQSW_NNBLK_TXMSGS));
+               kqswnal_data.kqn_txds = ktx->ktx_alloclist;
+               PORTAL_FREE(ktx, sizeof(*ktx));
        }
 
-       if (kqswnal_data.kqn_rxds != NULL)
-       {
-               int   i;
-               int   j;
-
-               for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
-               {
-                       kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+       while (kqswnal_data.kqn_rxds != NULL) {
+               int           i;
 
-                       for (j = 0; j < krx->krx_npages; j++)
-                               if (krx->krx_pages[j] != NULL)
-                                       __free_page (krx->krx_pages[j]);
-               }
+               krx = kqswnal_data.kqn_rxds;
+               for (i = 0; i < krx->krx_npages; i++)
+                       if (krx->krx_kiov[i].kiov_page != NULL)
+                               __free_page (krx->krx_kiov[i].kiov_page);
 
-               PORTAL_FREE(kqswnal_data.kqn_rxds,
-                           sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
-                                                   KQSW_NRXMSGS_LARGE));
+               kqswnal_data.kqn_rxds = krx->krx_alloclist;
+               PORTAL_FREE(krx, sizeof (*krx));
        }
 
        /* resets flags, pointers to NULL etc */
@@ -322,38 +345,45 @@ kqswnal_finalise (void)
 
        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
 
-       printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n",
+       printk (KERN_INFO "Lustre: Routing QSW NAL unloaded (final mem %d)\n",
                 atomic_read(&portal_kmemory));
 }
 
-static int __init
-kqswnal_initialise (void)
+static int
+kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
+                ptl_ni_limits_t *requested_limits, 
+                ptl_ni_limits_t *actual_limits)
 {
+#if MULTIRAIL_EKC
+       EP_RAILMASK       all_rails = EP_RAILMASK_ALL;
+#else
        ELAN3_DMA_REQUEST dmareq;
+#endif
        int               rc;
        int               i;
+       kqswnal_rx_t     *krx;
+       kqswnal_tx_t     *ktx;
        int               elan_page_idx;
+       ptl_process_id_t  my_process_id;
        int               pkmem = atomic_read(&portal_kmemory);
 
-       LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
+       LASSERT (nal == &kqswnal_api);
 
-       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
+       if (nal->nal_refct != 0) {
+               if (actual_limits != NULL)
+                       *actual_limits = kqswnal_lib.libnal_ni.ni_actual_limits;
+               /* This module got the first ref */
+               PORTAL_MODULE_USE;
+               return (PTL_OK);
+       }
 
-       kqswnal_api.forward  = kqswnal_forward;
-       kqswnal_api.shutdown = kqswnal_shutdown;
-       kqswnal_api.yield    = kqswnal_yield;
-       kqswnal_api.validate = NULL;            /* our api validate is a NOOP */
-       kqswnal_api.lock     = kqswnal_lock;
-       kqswnal_api.unlock   = kqswnal_unlock;
-       kqswnal_api.nal_data = &kqswnal_data;
+       LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
 
-       kqswnal_lib.nal_data = &kqswnal_data;
+       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
 
        /* ensure all pointers NULL etc */
        memset (&kqswnal_data, 0, sizeof (kqswnal_data));
 
-       kqswnal_data.kqn_cb = &kqswnal_lib;
-
        INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
        INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds);
        INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds);
@@ -368,137 +398,189 @@ kqswnal_initialise (void)
        spin_lock_init (&kqswnal_data.kqn_sched_lock);
        init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
 
-       spin_lock_init (&kqswnal_data.kqn_statelock);
+       /* Leave kqn_rpc_success zeroed */
+#if MULTIRAIL_EKC
+       kqswnal_data.kqn_rpc_failed.Data[0] = -ECONNREFUSED;
+#else
+       kqswnal_data.kqn_rpc_failed.Status = -ECONNREFUSED;
+#endif
 
        /* pointers/lists/locks initialised */
        kqswnal_data.kqn_init = KQN_INIT_DATA;
+       
+#if MULTIRAIL_EKC
+       kqswnal_data.kqn_ep = ep_system();
+       if (kqswnal_data.kqn_ep == NULL) {
+               CERROR("Can't initialise EKC\n");
+               kqswnal_shutdown(nal);
+               return (PTL_IFACE_INVALID);
+       }
 
+       if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) {
+               CERROR("Can't get elan ID\n");
+               kqswnal_shutdown(nal);
+               return (PTL_IFACE_INVALID);
+       }
+#else
        /**********************************************************************/
        /* Find the first Elan device */
 
-       kqswnal_data.kqn_epdev = ep_device (0);
-       if (kqswnal_data.kqn_epdev == NULL)
+       kqswnal_data.kqn_ep = ep_device (0);
+       if (kqswnal_data.kqn_ep == NULL)
        {
                CERROR ("Can't get elan device 0\n");
-               return (-ENOMEM);
+               kqswnal_shutdown(nal);
+               return (PTL_IFACE_INVALID);
        }
+#endif
 
        kqswnal_data.kqn_nid_offset = 0;
-       kqswnal_data.kqn_nnodes     = ep_numnodes (kqswnal_data.kqn_epdev);
-       kqswnal_data.kqn_elanid     = ep_nodeid (kqswnal_data.kqn_epdev);
+       kqswnal_data.kqn_nnodes     = ep_numnodes (kqswnal_data.kqn_ep);
+       kqswnal_data.kqn_elanid     = ep_nodeid (kqswnal_data.kqn_ep);
        
        /**********************************************************************/
        /* Get the transmitter */
 
-       kqswnal_data.kqn_eptx = ep_alloc_large_xmtr (kqswnal_data.kqn_epdev);
+       kqswnal_data.kqn_eptx = ep_alloc_xmtr (kqswnal_data.kqn_ep);
        if (kqswnal_data.kqn_eptx == NULL)
        {
                CERROR ("Can't allocate transmitter\n");
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               kqswnal_shutdown (nal);
+               return (PTL_NO_SPACE);
        }
 
        /**********************************************************************/
        /* Get the receivers */
 
-       kqswnal_data.kqn_eprx_small = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
-                                                            EP_SVC_LARGE_PORTALS_SMALL,
-                                                            KQSW_EP_ENVELOPES_SMALL);
+       kqswnal_data.kqn_eprx_small = ep_alloc_rcvr (kqswnal_data.kqn_ep,
+                                                    EP_MSG_SVC_PORTALS_SMALL,
+                                                    KQSW_EP_ENVELOPES_SMALL);
        if (kqswnal_data.kqn_eprx_small == NULL)
        {
                CERROR ("Can't install small msg receiver\n");
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               kqswnal_shutdown (nal);
+               return (PTL_NO_SPACE);
        }
 
-       kqswnal_data.kqn_eprx_large = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
-                                                            EP_SVC_LARGE_PORTALS_LARGE,
-                                                            KQSW_EP_ENVELOPES_LARGE);
+       kqswnal_data.kqn_eprx_large = ep_alloc_rcvr (kqswnal_data.kqn_ep,
+                                                    EP_MSG_SVC_PORTALS_LARGE,
+                                                    KQSW_EP_ENVELOPES_LARGE);
        if (kqswnal_data.kqn_eprx_large == NULL)
        {
                CERROR ("Can't install large msg receiver\n");
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               kqswnal_shutdown (nal);
+               return (PTL_NO_SPACE);
        }
 
        /**********************************************************************/
-       /* Reserve Elan address space for transmit buffers */
-
+       /* Reserve Elan address space for transmit descriptors NB we may
+        * either send the contents of associated buffers immediately, or
+        * map them for the peer to suck/blow... */
+#if MULTIRAIL_EKC
+       kqswnal_data.kqn_ep_tx_nmh = 
+               ep_dvma_reserve(kqswnal_data.kqn_ep,
+                               KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS),
+                               EP_PERM_WRITE);
+       if (kqswnal_data.kqn_ep_tx_nmh == NULL) {
+               CERROR("Can't reserve tx dma space\n");
+               kqswnal_shutdown(nal);
+               return (PTL_NO_SPACE);
+       }
+#else
         dmareq.Waitfn   = DDI_DMA_SLEEP;
         dmareq.ElanAddr = (E3_Addr) 0;
         dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
-        dmareq.Perm     = ELAN_PERM_REMOTEREAD;
+        dmareq.Perm     = ELAN_PERM_REMOTEWRITE;
 
-       rc = elan3_dma_reserve(kqswnal_data.kqn_epdev->DmaState,
+       rc = elan3_dma_reserve(kqswnal_data.kqn_ep->DmaState,
                              KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS),
                              &dmareq, &kqswnal_data.kqn_eptxdmahandle);
        if (rc != DDI_SUCCESS)
        {
                CERROR ("Can't reserve rx dma space\n");
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               kqswnal_shutdown (nal);
+               return (PTL_NO_SPACE);
        }
-
+#endif
        /**********************************************************************/
        /* Reserve Elan address space for receive buffers */
-
+#if MULTIRAIL_EKC
+       kqswnal_data.kqn_ep_rx_nmh =                    /* one reservation sized for all small + large rx message pages */
+               ep_dvma_reserve(kqswnal_data.kqn_ep,
+                               KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                               KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE,
+                               EP_PERM_WRITE);
+       if (kqswnal_data.kqn_ep_rx_nmh == NULL) {       /* test the rx handle just reserved (was wrongly testing the tx handle) */
+               CERROR("Can't reserve rx dma space\n");
+               kqswnal_shutdown(nal);                  /* unwind whatever was set up so far */
+               return (PTL_NO_SPACE);
+       }
+#else
         dmareq.Waitfn   = DDI_DMA_SLEEP;
         dmareq.ElanAddr = (E3_Addr) 0;
         dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
         dmareq.Perm     = ELAN_PERM_REMOTEWRITE;
 
-       rc = elan3_dma_reserve (kqswnal_data.kqn_epdev->DmaState,
+       rc = elan3_dma_reserve (kqswnal_data.kqn_ep->DmaState,
                                KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
                                KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE,
                                &dmareq, &kqswnal_data.kqn_eprxdmahandle);
        if (rc != DDI_SUCCESS)
        {
                CERROR ("Can't reserve rx dma space\n");
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               kqswnal_shutdown (nal);
+               return (PTL_NO_SPACE);
        }
-
+#endif
        /**********************************************************************/
        /* Allocate/Initialise transmit descriptors */
 
-       PORTAL_ALLOC(kqswnal_data.kqn_txds,
-                    sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
-       if (kqswnal_data.kqn_txds == NULL)
-       {
-               kqswnal_finalise ();
-               return (-ENOMEM);
-       }
-
-       /* clear flags, null pointers etc */
-       memset(kqswnal_data.kqn_txds, 0,
-              sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       kqswnal_data.kqn_txds = NULL;
        for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
        {
                int           premapped_pages;
-               kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
                int           basepage = i * KQSW_NTXMSGPAGES;
 
+               PORTAL_ALLOC (ktx, sizeof(*ktx));
+               if (ktx == NULL) {
+                       kqswnal_shutdown (nal);
+                       return (PTL_NO_SPACE);
+               }
+
+               memset(ktx, 0, sizeof(*ktx));   /* NULL pointers; zero flags */
+               ktx->ktx_alloclist = kqswnal_data.kqn_txds;
+               kqswnal_data.kqn_txds = ktx;
+
                PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
                if (ktx->ktx_buffer == NULL)
                {
-                       kqswnal_finalise ();
-                       return (-ENOMEM);
+                       kqswnal_shutdown (nal);
+                       return (PTL_NO_SPACE);
                }
 
                /* Map pre-allocated buffer NOW, to save latency on transmit */
                premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
                                                        KQSW_TX_BUFFER_SIZE);
-
-               elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+#if MULTIRAIL_EKC
+               ep_dvma_load(kqswnal_data.kqn_ep, NULL, 
+                            ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE, 
+                            kqswnal_data.kqn_ep_tx_nmh, basepage,
+                            &all_rails, &ktx->ktx_ebuffer);
+#else
+               elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState,
                                       kqswnal_data.kqn_eptxdmahandle,
                                       ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
                                       basepage, &ktx->ktx_ebuffer);
-
+#endif
                ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
                ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
 
                INIT_LIST_HEAD (&ktx->ktx_delayed_list);
 
+               ktx->ktx_state = KTX_IDLE;
+#if MULTIRAIL_EKC
+               ktx->ktx_rail = -1;             /* unset rail */
+#endif
                ktx->ktx_isnblk = (i >= KQSW_NTXMSGS);
                list_add_tail (&ktx->ktx_list, 
                               ktx->ktx_isnblk ? &kqswnal_data.kqn_nblk_idletxds :
@@ -507,24 +589,26 @@ kqswnal_initialise (void)
 
        /**********************************************************************/
        /* Allocate/Initialise receive descriptors */
-
-       PORTAL_ALLOC (kqswnal_data.kqn_rxds,
-                     sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
-       if (kqswnal_data.kqn_rxds == NULL)
-       {
-               kqswnal_finalise ();
-               return (-ENOMEM);
-       }
-
-       memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
-              sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
-
+       kqswnal_data.kqn_rxds = NULL;
        elan_page_idx = 0;
        for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
        {
-               E3_Addr       elanaddr;
+#if MULTIRAIL_EKC
+               EP_NMD        elanbuffer;
+#else
+               E3_Addr       elanbuffer;
+#endif
                int           j;
-               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               PORTAL_ALLOC(krx, sizeof(*krx));
+               if (krx == NULL) {
+                       kqswnal_shutdown(nal);
+                       return (PTL_NO_SPACE);
+               }
+
+               memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
+               krx->krx_alloclist = kqswnal_data.kqn_rxds;
+               kqswnal_data.kqn_rxds = krx;
 
                if (i < KQSW_NRXMSGS_SMALL)
                {
@@ -540,27 +624,45 @@ kqswnal_initialise (void)
                LASSERT (krx->krx_npages > 0);
                for (j = 0; j < krx->krx_npages; j++)
                {
-                       krx->krx_pages[j] = alloc_page(GFP_KERNEL);
-                       if (krx->krx_pages[j] == NULL)
-                       {
-                               kqswnal_finalise ();
-                               return (-ENOMEM);
+                       struct page *page = alloc_page(GFP_KERNEL);
+                       
+                       if (page == NULL) {
+                               kqswnal_shutdown (nal);
+                               return (PTL_NO_SPACE);
                        }
 
-                       LASSERT(page_address(krx->krx_pages[j]) != NULL);
-
-                       elan3_dvma_kaddr_load(kqswnal_data.kqn_epdev->DmaState,
+                       krx->krx_kiov[j].kiov_page = page;
+                       LASSERT(page_address(page) != NULL);
+
+#if MULTIRAIL_EKC
+                       ep_dvma_load(kqswnal_data.kqn_ep, NULL,
+                                    page_address(page),
+                                    PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh,
+                                    elan_page_idx, &all_rails, &elanbuffer);
+                       
+                       if (j == 0) {
+                               krx->krx_elanbuffer = elanbuffer;
+                       } else {
+                               rc = ep_nmd_merge(&krx->krx_elanbuffer,
+                                                 &krx->krx_elanbuffer, 
+                                                 &elanbuffer);
+                               /* NB contiguous mapping */
+                               LASSERT(rc);
+                       }
+#else
+                       elan3_dvma_kaddr_load(kqswnal_data.kqn_ep->DmaState,
                                              kqswnal_data.kqn_eprxdmahandle,
-                                             page_address(krx->krx_pages[j]),
+                                             page_address(page),
                                              PAGE_SIZE, elan_page_idx,
-                                             &elanaddr);
-                       elan_page_idx++;
-
+                                             &elanbuffer);
                        if (j == 0)
-                               krx->krx_elanaddr = elanaddr;
+                               krx->krx_elanbuffer = elanbuffer;
+
+                       /* NB contiguous mapping */
+                       LASSERT (elanbuffer == krx->krx_elanbuffer + j * PAGE_SIZE);
+#endif
+                       elan_page_idx++;
 
-                       /* NB we assume a contiguous  */
-                       LASSERT (elanaddr == krx->krx_elanaddr + j * PAGE_SIZE);
                }
        }
        LASSERT (elan_page_idx ==
@@ -570,45 +672,51 @@ kqswnal_initialise (void)
        /**********************************************************************/
        /* Network interface ready to initialise */
 
-        rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni);
-        if (rc != 0)
+       my_process_id.nid = kqswnal_elanid2nid(kqswnal_data.kqn_elanid);
+       my_process_id.pid = requested_pid;
+
+       rc = lib_init(&kqswnal_lib, nal, my_process_id,
+                     requested_limits, actual_limits);
+        if (rc != PTL_OK)
        {
-               CERROR ("PtlNIInit failed %d\n", rc);
-               kqswnal_finalise ();
-               return (-ENOMEM);
+               CERROR ("lib_init failed %d\n", rc);
+               kqswnal_shutdown (nal);
+               return (rc);
        }
 
-       kqswnal_data.kqn_init = KQN_INIT_PTL;
+       kqswnal_data.kqn_init = KQN_INIT_LIB;
 
        /**********************************************************************/
        /* Queue receives, now that it's OK to run their completion callbacks */
 
-       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
-       {
-               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
-
+       for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
                /* NB this enqueue can allocate/sleep (attr == 0) */
+               krx->krx_state = KRX_POSTED;
+#if MULTIRAIL_EKC
+               rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
+                                     &krx->krx_elanbuffer, 0);
+#else
                rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
-                                     krx->krx_elanaddr,
+                                     krx->krx_elanbuffer,
                                      krx->krx_npages * PAGE_SIZE, 0);
-               if (rc != 0)
+#endif
+               if (rc != EP_SUCCESS)
                {
                        CERROR ("failed ep_queue_receive %d\n", rc);
-                       kqswnal_finalise ();
-                       return (-ENOMEM);
+                       kqswnal_shutdown (nal);
+                       return (PTL_FAIL);
                }
        }
 
        /**********************************************************************/
        /* Spawn scheduling threads */
-       for (i = 0; i < smp_num_cpus; i++)
-       {
+       for (i = 0; i < num_online_cpus(); i++) {
                rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
                if (rc != 0)
                {
                        CERROR ("failed to spawn scheduling thread: %d\n", rc);
-                       kqswnal_finalise ();
-                       return (rc);
+                       kqswnal_shutdown (nal);
+                       return (PTL_FAIL);
                }
        }
 
@@ -617,31 +725,74 @@ kqswnal_initialise (void)
        rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface);
        CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc);
 
-       rc = kportal_nal_register (QSWNAL, &kqswnal_cmd, NULL);
+       rc = libcfs_nal_cmd_register (QSWNAL, &kqswnal_cmd, NULL);
        if (rc != 0) {
                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
-               kqswnal_finalise ();
-               return (rc);
+               kqswnal_shutdown (nal);
+               return (PTL_FAIL);
        }
 
-       PORTAL_SYMBOL_REGISTER(kqswnal_ni);
        kqswnal_data.kqn_init = KQN_INIT_ALL;
 
-       printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d "
+       printk(KERN_INFO "Lustre: Routing QSW NAL loaded on node %d of %d "
               "(Routing %s, initial mem %d)\n", 
               kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes,
               kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
               pkmem);
 
-       return (0);
+       return (PTL_OK);
 }
 
+void __exit                    /* module unload hook; wired up via module_exit() below */
+kqswnal_finalise (void)
+{
+#if CONFIG_SYSCTL
+       if (kqswnal_tunables.kqn_sysctl != NULL)        /* handle saved by kqswnal_initialise(); presumably NULL if registration failed */
+               unregister_sysctl_table (kqswnal_tunables.kqn_sysctl);
+#endif
+       PtlNIFini(kqswnal_ni);                  /* release the handle PtlNIInit() filled in at module load */
+
+       ptl_unregister_nal(QSWNAL);             /* undo the ptl_register_nal() done in kqswnal_initialise() */
+}
+
+static int __init              /* module load hook; returns 0 on success or a negative errno */
+kqswnal_initialise (void)
+{
+       int   rc;
+
+       kqswnal_api.nal_ni_init = kqswnal_startup;      /* NAL startup entry point; presumably invoked via PtlNIInit() */
+       kqswnal_api.nal_ni_fini = kqswnal_shutdown;     /* NAL shutdown entry point; presumably invoked via PtlNIFini() */
+
+       /* Initialise dynamic tunables to defaults once only */
+       kqswnal_tunables.kqn_optimized_puts = KQSW_OPTIMIZED_PUTS;
+       kqswnal_tunables.kqn_optimized_gets = KQSW_OPTIMIZED_GETS;
+       
+       rc = ptl_register_nal(QSWNAL, &kqswnal_api);    /* undone on the failure path below and in kqswnal_finalise() */
+       if (rc != PTL_OK) {
+               CERROR("Can't register QSWNAL: %d\n", rc);
+               return (-ENOMEM);               /* or something... */
+       }
 
-MODULE_AUTHOR("W. Marcus Miller <marcusm@llnl.gov>");
-MODULE_DESCRIPTION("Kernel Quadrics Switch NAL v1.00");
+       /* Pure gateways, and the workaround for 'EKC blocks forever until
+        * the service is active' want the NAL started up at module load
+        * time... */
+       rc = PtlNIInit(QSWNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kqswnal_ni);
+       if (rc != PTL_OK && rc != PTL_IFACE_DUP) {      /* PTL_IFACE_DUP is tolerated; presumably the NI is already up */
+               ptl_unregister_nal(QSWNAL);             /* back out the registration made above */
+               return (-ENODEV);
+       }
+
+#if CONFIG_SYSCTL
+        /* Press on regardless even if registering sysctl doesn't work */
+        kqswnal_tunables.kqn_sysctl = 
+               register_sysctl_table (kqswnal_top_ctl_table, 0);       /* result is NULL-checked in kqswnal_finalise() */
+#endif
+       return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel Quadrics/Elan NAL v1.01");
 MODULE_LICENSE("GPL");
 
 module_init (kqswnal_initialise);
 module_exit (kqswnal_finalise);
-
-EXPORT_SYMBOL (kqswnal_ni);