b=7982
author pjkirner <pjkirner>
Wed, 21 Sep 2005 02:08:05 +0000 (02:08 +0000)
committer pjkirner <pjkirner>
Wed, 21 Sep 2005 02:08:05 +0000 (02:08 +0000)
* Portals LND

16 files changed:
lnet/klnds/ptllnd/Makefile.in
lnet/klnds/ptllnd/README [new file with mode: 0644]
lnet/klnds/ptllnd/autoMakefile.am
lnet/klnds/ptllnd/ptllnd.c
lnet/klnds/ptllnd/ptllnd.h
lnet/klnds/ptllnd/ptllnd_cb.c [new file with mode: 0644]
lnet/klnds/ptllnd/ptllnd_modparams.c [new file with mode: 0644]
lnet/klnds/ptllnd/ptllnd_peer.c [new file with mode: 0644]
lnet/klnds/ptllnd/ptllnd_rx_buf.c [new file with mode: 0644]
lnet/klnds/ptllnd/ptllnd_tx.c [new file with mode: 0644]
lnet/klnds/ptllnd/ptllnd_wire.h [new file with mode: 0644]
lnet/klnds/ptllnd/wirecheck.c [new file with mode: 0644]
lnet/libcfs/nidstrings.c
lnet/tests/ut.h
lnet/tests/ut_cli.c
lnet/tests/ut_srv.c

index ae5477c..2435b58 100755 (executable)
@@ -1,7 +1,7 @@
 MODULES := kptllnd
 
-kptllnd-objs := ptllnd.o
+EXTRA_POST_CFLAGS := @PTLLNDCPPFLAGS@
 
-EXTRA_PRE_CFLAGS := @PTLLNDCPPFLAGS@
+kptllnd-objs := ptllnd_rx_buf.o ptllnd_tx.o ptllnd.o ptllnd_cb.o ptllnd_modparams.o ptllnd_peer.o
 
 @INCLUDE_RULES@
diff --git a/lnet/klnds/ptllnd/README b/lnet/klnds/ptllnd/README
new file mode 100644 (file)
index 0000000..d6cfc37
--- /dev/null
@@ -0,0 +1,47 @@
+1. This version of the Portals LND is intended to work on the Cray XT3 using
+   Cray Portals as a network transport.
+
+2. To enable the building of the Portals LND (ptllnd.ko), configure with the
+   following option:
+   ./configure --with-portals=<path-to-portals-headers>
+
+3. The following configuration options are supported:
+
+        ntx:
+            The total number of message descriptors
+
+        concurrent_peers:
+            The maximum number of concurrent peers.  Peers attempting
+            to connect beyond the maximum will not be allowed.
+
+        peer_hash_table_size:
+            The number of hash table slots for the peers. This number
+            should scale with concurrent_peers.
+
+        cksum:
+            Set to non-zero to enable message (not RDMA) checksums for
+            outgoing packets.  Incoming packets will always be checksummed
+            if necessary, independent of this value.
+
+        timeout:
+            The amount of time a request can linger in a peer's active
+            queue before the peer is considered dead.  Units: seconds.
+
+        portal:
+            The portal ID to use for the ptllnd traffic.
+
+        rxb_npages:
+            The number of pages in an RX Buffer.
+
+        credits:
+            The maximum total number of concurrent sends that are
+            outstanding at any given instant.
+
+        peercredits:
+            The maximum number of concurrent sends that are
+            outstanding to a single peer at any given instant.
+
+        max_immd_size:
+            The maximum immediate message size.  This MUST be
+            the same on all nodes in a cluster.  A peer connecting
+            with a different max_immd_size will be rejected.
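+
+4. For example, a hypothetical modprobe invocation setting a few of the
+   options above (the Makefile builds the module as kptllnd):
+
+   modprobe kptllnd credits=256 peercredits=8 max_immd_size=512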
index f2f6175..2ac93ab 100755 (executable)
@@ -10,4 +10,4 @@ endif
 endif
 
 MOSTLYCLEANFILES = *.o *.ko *.mod.c
-DIST_SOURCES = $(kptllnd-objs:%.o=%.c) ptllnd.h 
+DIST_SOURCES = $(kptllnd-objs:%.o=%.c) ptllnd.h ptllnd_wire.h
index 3688f5d..caed511 100755 (executable)
 
 #include "ptllnd.h"
 
-lnet_handle_ni_t nih;\r
-\r
-\r
+
+lnd_t               kptllnd_lnd = {
+        .lnd_type       = PTLLND,
+        .lnd_startup    = kptllnd_startup,
+        .lnd_shutdown   = kptllnd_shutdown,
+        .lnd_ctl        = kptllnd_ctl,
+        .lnd_send       = kptllnd_send,
+        .lnd_recv       = kptllnd_recv,
+};
+
+kptl_data_t             kptllnd_data;
+kptl_stats_t            kptllnd_stats;
+
+void kptllnd_shutdown (lnet_ni_t *ni);
+
+void ptllnd_assert_wire_constants (void)
+{
+        /* TBD - auto generated */
+}
+
+__u32
+kptllnd_cksum (void *ptr, int nob)
+{
+        char  *c  = ptr;
+        __u32  sum = 0;
+
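+        /* 1-bit left-rotate of the running sum, then add the next byte */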
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+        /* ensure I don't return 0 (== no checksum) */
+        return (sum == 0) ? 1 : sum;
+}
+
+void
+kptllnd_init_msg(kptl_msg_t *msg, int type, int body_nob)
+{
+        msg->ptlm_type = type;
+        msg->ptlm_nob  = offsetof(kptl_msg_t, ptlm_u) + body_nob;
+}
+
+void
+kptllnd_msg_pack(
+        kptl_msg_t *msg,
+        int credits,
+        lnet_nid_t dstnid,
+        __u64 dststamp,
+        __u64 seq,
+        kptl_data_t *kptllnd_data)
+{
+        msg->ptlm_magic    = PTLLND_MSG_MAGIC;
+        msg->ptlm_version  = PTLLND_MSG_VERSION;
+        /* msg->ptlm_type  Filled in kptllnd_init_msg()  */
+        msg->ptlm_credits  = credits;
+        /* msg->ptlm_nob   Filled in kptllnd_init_msg()  */
+        msg->ptlm_cksum    = 0;
+        msg->ptlm_srcnid   = kptllnd_data->kptl_ni->ni_nid;
+        msg->ptlm_srcstamp = kptllnd_data->kptl_incarnation;
+        msg->ptlm_dstnid   = dstnid;
+        msg->ptlm_dststamp = dststamp;
+        msg->ptlm_seq      = seq;
+        
+        if (*kptllnd_tunables.kptl_cksum) {
+                /* NB ptlm_cksum zero while computing cksum */
+                msg->ptlm_cksum = kptllnd_cksum(msg, msg->ptlm_nob);
+        }
+}
+
+int
+kptllnd_msg_unpack(kptl_msg_t *msg, int nob, kptl_data_t *kptllnd_data)
+{
+        const int hdr_size = offsetof(kptl_msg_t, ptlm_u);
+        __u32     msg_cksum;
+        int       flip;
+        int       msg_nob;
+
+        /* 6 bytes are enough to have received magic + version */
+        if (nob < 6) {
+                CERROR("Very Short message: %d\n", nob);
+                return -EPROTO;
+        }
+
+        /*
+         * Determine if we need to flip
+         */
+        if (msg->ptlm_magic == PTLLND_MSG_MAGIC) {
+                flip = 0;
+        } else if (msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC)) {
+                flip = 1;
+        } else {
+                CERROR("Bad magic: %08x\n", msg->ptlm_magic);
+                return -EPROTO;
+        }
+
+        if (msg->ptlm_version !=
+            (flip ? __swab16(PTLLND_MSG_VERSION) : PTLLND_MSG_VERSION)) {
+                CERROR("Bad version: got %d expected %d\n",
+                        msg->ptlm_version,PTLLND_MSG_VERSION);
+                return -EPROTO;
+        }
+
+        if (nob < hdr_size) {
+                CERROR("Short header: got %d, wanted at least %d\n",
+                        nob, hdr_size);
+                return -EPROTO;
+        }
+
+        msg_nob = flip ? __swab32(msg->ptlm_nob) : msg->ptlm_nob;
+        if (nob != msg_nob) {
+                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+                return -EPROTO;
+        }
+
+        /* checksum must be computed with
+         * 1) ptlm_cksum zero and
+         * 2) BEFORE anything gets modified/flipped
+         */
+        msg_cksum = flip ? __swab32(msg->ptlm_cksum) : msg->ptlm_cksum;
+        msg->ptlm_cksum = 0;
+        if (msg_cksum != 0){
+                STAT_UPDATE(kps_incoming_checksums_calculated);
+                if( msg_cksum != kptllnd_cksum(msg, msg_nob) ) {
+                        STAT_UPDATE(kps_incoming_checksums_invalid);
+                        CERROR("Bad checksum\n");
+                        return -EPROTO;
+                }
+        }
+
+        /* Restore the checksum */
+        msg->ptlm_cksum = msg_cksum;
+
+        if(flip){
+                 /* leave magic unflipped as a clue to peer endianness */
+                __swab16s(&msg->ptlm_version);
+                /* These two are 1 byte long so we don't swap them,
+                   but check this assumption */
+                CLASSERT (sizeof(msg->ptlm_type) == 1);
+                CLASSERT (sizeof(msg->ptlm_credits) == 1);
+                msg->ptlm_nob = msg_nob;
+                __swab64s(&msg->ptlm_srcnid);
+                __swab64s(&msg->ptlm_srcstamp);
+                __swab64s(&msg->ptlm_dstnid);
+                __swab64s(&msg->ptlm_dststamp);
+                __swab64s(&msg->ptlm_seq);
+
+                switch(msg->ptlm_type)
+                {
+                        case PLTLND_MSG_TYPE_PUT:
+                        case PTLLND_MSG_TYPE_GET:
+                                __swab64s(&msg->ptlm_u.req.kptlrm_matchbits);
+                                break;
+                        case PTLLND_MSG_TYPE_IMMEDIATE:
+                        case PTLLND_MSG_TYPE_NOOP:
+                                /* Do nothing */
+                                break;
+                        case PTLLND_MSG_TYPE_HELLO:
+                                __swab64s(&msg->ptlm_u.hello.kptlhm_matchbits);
+                                __swab32s(&msg->ptlm_u.hello.kptlhm_max_immd_size);
+                                break;
+                        default:
+                                CERROR("Bad message type: %d\n", msg->ptlm_type);
+                                return -EPROTO;
+                }
+        }
+
+        /*
+         * Src nid cannot be ANY
+         */
+        if (msg->ptlm_srcnid == PTL_NID_ANY) {
+                CERROR("Bad src nid: "LPX64"\n", msg->ptlm_srcnid);
+                return -EPROTO;
+        }
+
+        return 0;
+}
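+
+/*
+ * Typical round trip (sketch): the sender calls kptllnd_init_msg() to set
+ * the type and length, then kptllnd_msg_pack() to stamp magic, version,
+ * NIDs, stamps, seq and (if enabled) the checksum; the receiver runs
+ * kptllnd_msg_unpack(), which validates magic/version/length/checksum and
+ * byte-swaps the remaining fields when the peer's endianness differs.
+ */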
+
+
+
+int
+kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+        struct portal_ioctl_data *data = arg;
+        int          rc = -EINVAL;
+        kptl_data_t *kptllnd_data = ni->ni_data;
+
+        PJK_UT_MSG(">>> kptllnd_ctl cmd=%u arg=%p\n",cmd,arg);
+
+        /*
+         * Validate that the context block is actually
+         * pointing to this interface
+         */
+        LASSERT (ni == kptllnd_data->kptl_ni);
+
+        switch(cmd) {
+        case IOC_PORTAL_DEL_PEER: {
+                rc = kptllnd_peer_del (kptllnd_data,data->ioc_nid);
+                break;
+        }
+        /*
+         * Not supported - this is legacy stuff
+        case IOC_PORTAL_GET_PEER:
+        case IOC_PORTAL_ADD_PEER:
+        case IOC_PORTAL_GET_CONN:
+        case IOC_PORTAL_CLOSE_CONNECTION:
+        case IOC_PORTAL_REGISTER_MYNID:
+        */
+        default:
+                CERROR("Unsupported IOCTL command %d\n",cmd);
+                rc=-EINVAL;
+                break;
+        }
+        PJK_UT_MSG("<<< kptllnd_ctl rc=%d\n",rc);
+        return rc;
+}
+
+void
+kptllnd_posted_object_setup(
+        kptl_posted_object_t* posted_obj,
+        kptl_data_t *kptllnd_data,
+        int type)
+{
+        /*
+         * Setup back pointer to LND instance data
+         */
+        posted_obj->po_kptllnd_data = kptllnd_data;
+
+        /*
+         * Setup descriptor type
+         */
+        posted_obj->po_flags.pof_type = type;
+}
+
+int
+kptllnd_startup (lnet_ni_t *ni)
+{
+        int             rc;
+        int             i;
+        struct timeval  tv;
+        kptl_data_t    *kptllnd_data;
+        ptl_err_t       ptl_rc;
+
+
+        PJK_UT_MSG(">>>\n");
+
+        LASSERT (ni->ni_lnd == &kptllnd_lnd);
+
+        PORTAL_ALLOC (kptllnd_data,sizeof(*kptllnd_data));
+        if (kptllnd_data == NULL){
+                CERROR ("Failed to allocate memory for PTLLND context\n");
+                return -ENOMEM;
+        }
+
+        /*
+         * zero pointers, flags etc
+         * put everything into a known state.
+         */
+        memset (kptllnd_data, 0, sizeof (*kptllnd_data));
+        kptllnd_data->kptl_eqh = PTL_INVALID_HANDLE;
+        kptllnd_data->kptl_nih = PTL_INVALID_HANDLE;
+
+        /*
+         * Uptick the module reference count
+         */
+        PORTAL_MODULE_USE;
+
+        /*
+         * Setup pointers between the ni and context data block
+         */
+        kptllnd_data->kptl_ni = ni;
+        ni->ni_data = kptllnd_data;
+
+        /*
+         * Setup Credits
+         */
+        ni->ni_maxtxcredits = *kptllnd_tunables.kptl_credits;
+        ni->ni_peertxcredits = *kptllnd_tunables.kptl_peercredits;
+
+
+        /*
+         * Initialize the Network interface instance
+         * We use the default because we don't have any
+         * way to choose a better interface.
+         * Requested and actual limits are ignored.
+         */
+        ptl_rc = PtlNIInit(PTL_IFACE_DEFAULT, 0, NULL, NULL,
+                           &kptllnd_data->kptl_nih);
+
+        /*
+         * Note: PTL_IFACE_DUP simply means that the requested
+         * interface was already inited and that we're sharing it.
+         * Which is ok.
+         */
+        if (ptl_rc != PTL_OK && ptl_rc != PTL_IFACE_DUP){
+                CERROR ("PtlNIInit: error %d\n", ptl_rc);
+                rc = -EINVAL;
+                goto failed;
+        }
+
+        ptl_rc = PtlEQAlloc(
+                kptllnd_data->kptl_nih,
+                8,                      /* We use callback - no need for max */
+                kptllnd_eq_callback,    /* handler callback */
+                &kptllnd_data->kptl_eqh);   /* output handle */
+        if (ptl_rc != PTL_OK) {
+                CERROR("PtlEQAlloc failed %d\n",ptl_rc);
+                rc = -ENOMEM;
+                goto failed;
+        }
+
+        /*
+         * Fetch the lower NID
+         */
+        ptl_rc = PtlGetId(kptllnd_data->kptl_nih, &kptllnd_data->kptl_portals_id);
+        if (ptl_rc != PTL_OK) {
+                CERROR ("PtlGetId: error %d\n", ptl_rc);
+                rc = -EINVAL;
+                goto failed;
+        }
+
+        PJK_UT_MSG("lnet nid=" LPX64 " (passed in)\n",ni->ni_nid);
+
+        /*
+         * Create the new NID.  Based on the LND network type
+         * and the lower ni's address data.
+         */
+        ni->ni_nid = ptl2lnetnid(kptllnd_data,kptllnd_data->kptl_portals_id.nid);
+
+        PJK_UT_MSG("ptl  nid=" LPX64 "\n",kptllnd_data->kptl_portals_id.nid);
+        PJK_UT_MSG("lnet nid=" LPX64 " (passed back)\n",ni->ni_nid);
+
+        CDEBUG(D_INFO,"ptl  nid=" LPX64 "\n",kptllnd_data->kptl_portals_id.nid);
+        CDEBUG(D_INFO,"lnet nid=" LPX64 "\n",ni->ni_nid);
+
+        /*
+         * Initialize the incarnation
+         */
+        do_gettimeofday(&tv);
+        kptllnd_data->kptl_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+        PJK_UT_MSG("Incarnation=" LPX64 "\n",kptllnd_data->kptl_incarnation);
+        CDEBUG(D_INFO,"Incarnation=" LPX64 "\n",kptllnd_data->kptl_incarnation);
+
+        /*
+         * Setup the sched locks/lists/waitq
+         */
+        spin_lock_init (&kptllnd_data->kptl_sched_lock);
+        init_waitqueue_head (&kptllnd_data->kptl_sched_waitq);
+        INIT_LIST_HEAD (&kptllnd_data->kptl_sched_txq);
+        INIT_LIST_HEAD (&kptllnd_data->kptl_sched_rxq);
+        INIT_LIST_HEAD (&kptllnd_data->kptl_sched_rxbq);
+
+        /*
+         * Setup the tx locks/lists/waitq
+         */
+        spin_lock_init (&kptllnd_data->kptl_tx_lock);
+        INIT_LIST_HEAD (&kptllnd_data->kptl_idle_txs);
+        INIT_LIST_HEAD (&kptllnd_data->kptl_idle_nblk_txs);
+        init_waitqueue_head(&kptllnd_data->kptl_idle_tx_waitq);
+
+        /*
+         * Allocate and setup the peer hash table
+         */
+        PJK_UT_MSG("Allocate Peer Hash Table\n");
+        rwlock_init(&kptllnd_data->kptl_peer_rw_lock);
+        kptllnd_data->kptl_peer_hash_size = *kptllnd_tunables.kptl_peer_hash_table_size;
+        INIT_LIST_HEAD(&kptllnd_data->kptl_canceled_peers);
+        PORTAL_ALLOC (kptllnd_data->kptl_peers,
+                      sizeof (struct list_head) * kptllnd_data->kptl_peer_hash_size);
+        if (kptllnd_data->kptl_peers == NULL) {
+                CERROR("Failed to allocate space for peer hash table size=%d\n",
+                        kptllnd_data->kptl_peer_hash_size);
+                rc = -ENOMEM;
+                goto failed;
+        }
+        for (i = 0; i < kptllnd_data->kptl_peer_hash_size; i++)
+                INIT_LIST_HEAD(&kptllnd_data->kptl_peers[i]);
+
+        /* lists/ptrs/locks initialised */
+        kptllnd_data->kptl_init = PTLLND_INIT_DATA;
+
+        /*****************************************************/
+
+        /*
+         * Start the scheduler threads for handling incoming
+         * requests.  No need to advance the state because
+         * this will be automatically cleaned up now that the
+         * PTLLND_INIT_DATA state has been entered.
+         */
+        PJK_UT_MSG("starting %d scheduler threads\n",PTLLND_N_SCHED);
+        for (i = 0; i < PTLLND_N_SCHED; i++) {
+                rc = kptllnd_thread_start (
+                        kptllnd_scheduler,
+                        i,
+                        kptllnd_data);
+                if (rc != 0) {
+                        CERROR("Can't spawn scheduler[%d]: %d\n", i, rc);
+                        goto failed;
+                }
+        }
+
+        /*
+         * Allocate space for the tx descriptors
+         * (Note we don't need to advance the init state
+         * because we'll use the pointer being NULL as a sentinel
+         * to know that we have to clean this up.)
+         */
+        PJK_UT_MSG("Allocate TX Descriptor array\n");
+        PORTAL_ALLOC (kptllnd_data->kptl_tx_descs,
+                      PTLLND_TX_MSGS() * sizeof(kptl_tx_t));
+        if (kptllnd_data->kptl_tx_descs == NULL){
+                CERROR ("Can't allocate space for TX Descriptor array count=%d\n",
+                        PTLLND_TX_MSGS());
+                rc = -ENOMEM;
+                goto failed;
+        }
+
+        /*
+         * Now setup the tx descriptors
+         */
+        rc = kptllnd_setup_tx_descs(kptllnd_data);
+        if (rc != 0) {
+                CERROR ("Can\'t setup tx descs: %d\n", rc);
+                goto failed;
+        }
+
+        /* flag TX descs initialised */
+        kptllnd_data->kptl_init = PTLLND_INIT_TXD;
+
+        /*****************************************************/
+
+
+        kptllnd_rx_buffer_pool_init(&kptllnd_data->kptl_rx_buffer_pool);
+
+        /* flag rx descs initialised */
+        kptllnd_data->kptl_init = PTLLND_INIT_RXD;
+
+        /*****************************************************/
+
+
+        kptllnd_data->kptl_rx_cache = cfs_mem_cache_create (
+                "ptllnd_rx",
+                sizeof(kptl_rx_t) + *kptllnd_tunables.kptl_max_immd_size,
+                0, /* offset */
+                0, /* flags */
+                NULL,NULL); /* CTOR/DTOR */
+        if (kptllnd_data->kptl_rx_cache == NULL) {
+                CERROR("Can't create slab for RX descriptors\n");
+                rc = -ENOMEM;
+                goto failed;
+        }
+
+        rc = kptllnd_rx_buffer_pool_reserve(
+                        &kptllnd_data->kptl_rx_buffer_pool,
+                        kptllnd_data,
+                        *kptllnd_tunables.kptl_concurrent_peers);
+        if( rc != 0) {
+                CERROR("Can't reserve RX Buffer pool: %d\n",rc);
+                goto failed;
+        }
+
+        /* flag everything initialised */
+        kptllnd_data->kptl_init = PTLLND_INIT_ALL;
+
+
+        /*****************************************************/
+
+        PJK_UT_MSG("<<< kptllnd_startup SUCCESS\n");
+        return 0;
+
+ failed:
+        CDEBUG(D_NET, "kptllnd_startup failed rc=%d\n",rc);
+        kptllnd_shutdown (ni);
+        PJK_UT_MSG("<<< kptllnd_startup rc=%d\n",rc);
+        return rc;
+}
+
+void
+kptllnd_shutdown (lnet_ni_t *ni)
+{
+        int             i;
+        kptl_data_t    *kptllnd_data = ni->ni_data;
+
+        PJK_UT_MSG(">>> kptllnd_shutdown\n");
+
+        /*
+         * Validate that the context block is actually
+         * pointing to this interface
+         */
+        LASSERT (ni == kptllnd_data->kptl_ni);
+
+        CDEBUG(D_MALLOC, "before LND cleanup: kmem %d\n",
+               atomic_read (&libcfs_kmemory));
+
+        /*
+         * Now, depending on where we are in the initialization,
+         * clean up the context block
+         */
+        switch (kptllnd_data->kptl_init) {
+
+        case PTLLND_INIT_ALL:
+        case PTLLND_INIT_RXD:
+                PJK_UT_MSG("PTLLND_INIT_RXD\n");
+
+                kptllnd_rx_buffer_pool_fini(
+                        &kptllnd_data->kptl_rx_buffer_pool);
+
+                LASSERT(list_empty(&kptllnd_data->kptl_sched_rxq));
+                LASSERT(list_empty(&kptllnd_data->kptl_sched_rxbq));
+
+                /* fall through */
+        case PTLLND_INIT_TXD:
+                PJK_UT_MSG("PTLLND_INIT_TXD\n");
+
+                /*
+                 * If there were peers started up then
+                 * clean them up.
+                 */
+                if( atomic_read(&kptllnd_data->kptl_npeers) != 0) {
+                        PJK_UT_MSG("Deleting %d peers\n",atomic_read(&kptllnd_data->kptl_npeers));
+
+                        /* nuke all peers */
+                        kptllnd_peer_del(kptllnd_data,PTL_NID_ANY);
+
+                        i = 2;
+                        while (atomic_read (&kptllnd_data->kptl_npeers) != 0) {
+
+                                i++;
+                                CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                                       "Waiting for %d peers to terminate\n",
+                                       atomic_read (&kptllnd_data->kptl_npeers));
+                                PJK_UT_MSG("Waiting for %d peers to terminate\n",
+                                        atomic_read (&kptllnd_data->kptl_npeers));
+                                cfs_pause(cfs_time_seconds(1));
+                        }
+                }
+
+                LASSERT(list_empty(&kptllnd_data->kptl_canceled_peers));
+                PJK_UT_MSG("All peers deleted\n");
+
+                /*
+                 * Set the shutdown flag
+                 */
+                kptllnd_data->kptl_shutdown = 1;
+
+                /*
+                 * The first thing we do is shut down the scheduler threads.
+                 * It makes cleanup easier to not have to worry about races
+                 * with N other threads.
+                 *
+                 * This is also safe regardless of the kptl_init state,
+                 * because it is a no-op when kptl_nthreads == 0, which is
+                 * the case if we are not in the right state.
+                 */
+                if(atomic_read (&kptllnd_data->kptl_nthreads) != 0){
+                        PJK_UT_MSG("Stopping %d threads\n",atomic_read(&kptllnd_data->kptl_nthreads));
+                        /*
+                         * Wake up all the schedulers
+                         */
+                        wake_up_all (&kptllnd_data->kptl_sched_waitq);
+
+                        i = 2;
+                        while (atomic_read (&kptllnd_data->kptl_nthreads) != 0) {
+                                i++;
+                                CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                                       "Waiting for %d threads to terminate\n",
+                                       atomic_read (&kptllnd_data->kptl_nthreads));
+                                PJK_UT_MSG("Waiting for %d threads to terminate\n",
+                                        atomic_read (&kptllnd_data->kptl_nthreads));
+                                cfs_pause(cfs_time_seconds(1));
+                        }
+
+                }
+                PJK_UT_MSG("All Threads stopped\n");
+
+
+                LASSERT(list_empty(&kptllnd_data->kptl_sched_txq));
+
+                kptllnd_cleanup_tx_descs(kptllnd_data);
+
+                /* fall through */
+        case PTLLND_INIT_DATA:
+
+                PJK_UT_MSG("PTLLND_INIT_DATA\n");
+
+                LASSERT (atomic_read(&kptllnd_data->kptl_npeers) == 0);
+                LASSERT (kptllnd_data->kptl_peers != NULL);
+                for (i = 0; i < kptllnd_data->kptl_peer_hash_size; i++) {
+                        LASSERT (list_empty (&kptllnd_data->kptl_peers[i]));
+                }
+                /*
+                 * Nothing here now, but libcfs might soon require
+                 * us to explicitly destroy wait queues and semaphores;
+                 * that would be done here.
+                 */
+
+                /* fall through */
+
+        case PTLLND_INIT_NOTHING:
+                PJK_UT_MSG("PTLLND_INIT_NOTHING\n");
+                break;
+        }
+
+        /*
+         * There are a number of things that can be done
+         * outside the state machine, because the construction
+         * (or lack thereof) can be determined directly from
+         * the pointer or handle itself.
+         * Clean these things up here
+         */
+
+        /*
+         * Cleanup the portals EQ
+         */
+        if(!PtlHandleIsEqual(kptllnd_data->kptl_eqh,PTL_INVALID_HANDLE))
+                PtlEQFree(kptllnd_data->kptl_eqh);
+
+        /*
+         * release the portals ni handle
+         */
+        if(!PtlHandleIsEqual(kptllnd_data->kptl_nih,PTL_INVALID_HANDLE))
+                PtlNIFini(kptllnd_data->kptl_nih);
+
+        /*
+         *  Free the tx descriptors
+         */
+        if (kptllnd_data->kptl_tx_descs != NULL)
+                PORTAL_FREE(kptllnd_data->kptl_tx_descs,
+                        PTLLND_TX_MSGS() * sizeof(kptl_tx_t));
+
+        /*
+         * Cleanup the RX descriptor slab
+         */
+        if (kptllnd_data->kptl_rx_cache != NULL)
+                cfs_mem_cache_destroy( kptllnd_data->kptl_rx_cache);
+
+        /*
+         * Cleanup the peer hash table
+         */
+        if (kptllnd_data->kptl_peers != NULL){
+                PORTAL_FREE (kptllnd_data->kptl_peers,
+                             sizeof (struct list_head) *
+                             kptllnd_data->kptl_peer_hash_size);
+        }
+
+        /*
+         * And free the context block
+         */
+        PORTAL_FREE(kptllnd_data,sizeof(*kptllnd_data));
+
+        CDEBUG(D_MALLOC, "after LND cleanup: kmem %d\n",
+               atomic_read (&libcfs_kmemory));
+
+        PORTAL_MODULE_UNUSE;
+        PJK_UT_MSG("<<<\n");
+}
+
+int __init
+kptllnd_module_init (void)
+{
+        int    rc;
+
+        PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__);
+
+        /*
+         * Display the module parameters
+         */
+        CDEBUG(D_INFO,"ntx = %d\n",*kptllnd_tunables.kptl_ntx);
+        CDEBUG(D_INFO,"ntx_nblk = %d\n",*kptllnd_tunables.kptl_ntx_nblk);
+        CDEBUG(D_INFO,"concurrent_peers = %d\n",*kptllnd_tunables.kptl_concurrent_peers);
+        CDEBUG(D_INFO,"cksum = %d\n",*kptllnd_tunables.kptl_cksum);
+        CDEBUG(D_INFO,"portal = %d\n",*kptllnd_tunables.kptl_portal);
+        CDEBUG(D_INFO,"timeout = %d (seconds)\n",*kptllnd_tunables.kptl_timeout);
+        CDEBUG(D_INFO,"rxb_npages = %d\n",*kptllnd_tunables.kptl_rxb_npages);
+        CDEBUG(D_INFO,"credits = %d\n",*kptllnd_tunables.kptl_credits);
+        CDEBUG(D_INFO,"peercredits = %d\n",*kptllnd_tunables.kptl_peercredits);
+        CDEBUG(D_INFO,"max_immd_size = %d\n",*kptllnd_tunables.kptl_max_immd_size);
+
+        ptllnd_assert_wire_constants();
+
+        rc = kptllnd_tunables_init();
+        if (rc != 0)
+                return rc;
+
+        lnet_register_lnd(&kptllnd_lnd);
+
+        PJK_UT_MSG("<<<\n");
+        return 0;
+}
+
 void __exit
 kptllnd_module_fini (void)
 {
+
         PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__);
-        PtlNIFini(nih);
+        lnet_unregister_lnd(&kptllnd_lnd);
+        kptllnd_tunables_fini();
+        kpttllnd_get_stats();
         PJK_UT_MSG("<<<\n");
 }
 
-int __init
-kptllnd_module_init (void)
+#define DO_TYPE(x) case x: return #x;
+
+const char *get_ev_type_string(int type)
 {
-        int    rc = 0;\r
-        lnet_process_id_t portals_id;
-        PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__);\r
-        \r
-        PJK_UT_MSG("PtlNIInit\n");
-        rc = PtlNIInit(PTL_IFACE_DEFAULT, 0, NULL, NULL, &nih);\r
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP){
-                /*CERROR ("PtlNIInit: error %d\n", rc);*/
-                goto failed;
+        switch(type)
+        {
+                DO_TYPE(PTL_EVENT_GET_START);
+                DO_TYPE(PTL_EVENT_GET_END);
+                DO_TYPE(PTL_EVENT_PUT_START);
+                DO_TYPE(PTL_EVENT_PUT_END);
+                DO_TYPE(PTL_EVENT_REPLY_START);
+                DO_TYPE(PTL_EVENT_REPLY_END);
+                DO_TYPE(PTL_EVENT_ACK);
+                DO_TYPE(PTL_EVENT_SEND_START);
+                DO_TYPE(PTL_EVENT_SEND_END);
+                DO_TYPE(PTL_EVENT_UNLINK);
+        default:
+                return "";
         }
-                \r
-        PJK_UT_MSG("PtlGetId\n");
-        if(rc != PtlGetId(nih,&portals_id)){
-                /*CERROR ("PtlGetID: error %d\n", rc);*/
-        }else{\r
-                PJK_UT_MSG("ptl nid=" LPX64 "\n",portals_id.nid);
-        }                
-      
-failed:                
-        PJK_UT_MSG("<<<\n");
-        return rc;
-}\r
-\r
+}
+
+const char *get_msg_type_string(int type)
+{
+        switch(type)
+        {
+                DO_TYPE(PTLLND_MSG_TYPE_INVALID);
+                DO_TYPE(PLTLND_MSG_TYPE_PUT);
+                DO_TYPE(PTLLND_MSG_TYPE_GET);
+                DO_TYPE(PTLLND_MSG_TYPE_IMMEDIATE);
+                DO_TYPE(PTLLND_MSG_TYPE_HELLO);
+        default:
+                return "";
+        }
+}
+
+#define LOGSTAT(x) PJK_UT_MSG_ALWAYS("%30.30s %d\n",#x,kptllnd_stats.x);
+
+kptl_stats_t* kpttllnd_get_stats(void)
+{
+        LOGSTAT(kps_incoming_checksums_calculated);
+        LOGSTAT(kps_incoming_checksums_invalid);
+        LOGSTAT(kps_cleaning_caneled_peers);
+        LOGSTAT(kps_checking_buckets);
+        LOGSTAT(kps_too_many_peers);
+        LOGSTAT(kps_peers_created);
+        LOGSTAT(kps_no_credits);
+        LOGSTAT(kps_saving_last_credit);
+        LOGSTAT(kps_rx_allocated);
+        LOGSTAT(kps_rx_released);
+        LOGSTAT(kps_rx_allocation_failed);
+        LOGSTAT(kps_tx_allocated);
+        LOGSTAT(kps_tx_released);
+        LOGSTAT(kpt_tx_allocation_failed);
+        LOGSTAT(kpx_recv_delayed);
+        LOGSTAT(kpx_send_routing);
+        LOGSTAT(kpx_send_target_is_router);
+
+        return &kptllnd_stats;
+}
+
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel Portals LND v1.00");\r
-/*MODULE_LICENSE("GPL");*/
+MODULE_DESCRIPTION("Kernel Portals LND v1.00");
+MODULE_LICENSE("GPL");
 
 module_init(kptllnd_module_init);
 module_exit(kptllnd_module_fini);
index 4c302d0..2469d4c 100755 (executable)
 #include <linux/list.h>
 #include <linux/kmod.h>
 #include <linux/sysctl.h>
-#include <linux/random.h>\r
-\r
+#include <linux/random.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+
+#define DEBUG_SUBSYSTEM S_NAL
+
 #include <libcfs/kp30.h>
 #include <lnet/lnet.h>
-#include <lnet/lib-lnet.h>\r
-\r
+#include <lnet/lib-lnet.h>
 #include <portals/p30.h>
-\r
-#define PJK_UT_MSG(fmt...) do{printk("<1>ptllnd: %-30s ",__FUNCTION__);printk(fmt);}while(0)
+
+/*
+ * The PTLLND was designed to support Portals with both
+ * Lustre and non-Lustre UNLINK semantics.
+ * However, for now the two targets, Cray Portals
+ * on the XT3 and Lustre Portals (for testing), both
+ * have Lustre UNLINK semantics, so this is defined
+ * by default.
+ */
+#define LUSTRE_PORTALS_UNLINK_SEMANTICS
+
+
+/*
+ * Define this to enable console debug logging
+ * and simulation
+ */
+//#define PJK_DEBUGGING
+
+/*
+ * This was used for some single-node testing,
+ * which has some hacks to allow packets that come
+ * back on the loopback LND to have their addresses
+ * fixed up so that they match MDs properly.  You can
+ * then set up a connection with yourself and transfer data.
+ * WARNING: This was for UNIT testing purposes only.
+ */
+//#define TESTING_WITH_LOOPBACK
+
+
+
+#define PTL_RESERVED_MATCHBITS  0x100   /* below this value is reserved
+                                         * above is for bulk data transfer */
+#define LNET_MSG_MATCHBITS       0       /* the value for the message channel */
+
+#ifdef CONFIG_SMP
+# define PTLLND_N_SCHED         num_online_cpus()   /* # schedulers */
+#else
+# define PTLLND_N_SCHED         1                   /* # schedulers */
+#endif
+
+
+
+/* defaults for modparams/tunables */
+#define PTLLND_NTX              32         /* # tx descs */
+#define PTLLND_NTX_NBLK         256        /* # reserved tx descs */
+#define PTLLND_NRX              (64 * num_online_cpus()) /* # rx desc */
+#define PTLLND_CONCURRENT_PEERS 1152       /* # nodes all talking at once to me */
+#define PTLLND_CKSUM            0          /* checksum kptl_msg_t? 0 = Disabled */
+#define PTLLND_TIMEOUT          50         /* default comms timeout (seconds) */
+#define PTLLND_PORTAL           9          /* The same portal PTLRPC uses when talking to Cray Portals */
+#define PTLLND_RXB_NPAGES       1          /* Number of pages for a single RX Buffer */
+#define PTLLND_CREDITS          256        /* concurrent sends */
+#define PTLLND_PEERCREDITS      8          /* concurrent sends to 1 peer */
+#define PTLLND_MAX_MSG_SIZE     512        /* Maximum message size */
+#define PTLLND_PEER_HASH_SIZE   101        /* # of buckets in peer hash table */
+
+/* tunables fixed at compile time */
+#define PTLLND_CREDIT_HIGHWATER (*kptllnd_tunables.kptl_peercredits-1)  /* when to eagerly return credits */
+#define PTLLND_TIMEOUT_SEC      3          /* How often we check a subset of the peer hash table for timeouts */
+
+/************************/
+/* derived constants... */
+/* TX messages (shared by all connections) */
+#define PTLLND_TX_MSGS()       (*kptllnd_tunables.kptl_ntx +       \
+                               *kptllnd_tunables.kptl_ntx_nblk)
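+/* e.g. with the defaults above: 32 + 256 = 288 tx descriptors */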
+
+
+typedef struct
+{
+        int             *kptl_ntx;              /* # tx descs */
+        int             *kptl_ntx_nblk;         /* # reserved tx descs */
+        int             *kptl_concurrent_peers; /* max # nodes all talking to me */
+        int             *kptl_cksum;            /* checksum kptl_msg_t? */
+        int             *kptl_timeout;          /* comms timeout (seconds) */
+        int             *kptl_portal;           /* portal number */
+        int             *kptl_rxb_npages;       /* number of pages for rx buffer */
+        int             *kptl_credits;          /* number of credits */
+        int             *kptl_peercredits;      /* number of peer credits */
+        int             *kptl_max_immd_size;    /* max immd message size*/
+        int             *kptl_peer_hash_table_size; /* # slots in peer hash table */
+        
+#ifdef PJK_DEBUGGING        
+        int             *kptl_simulation_bitmap;/* simulation bitmap */
+#endif
+        
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+        struct ctl_table_header *kptl_sysctl;    /* sysctl interface */
+#endif
+} kptl_tunables_t;
+
+
+
+#include "ptllnd_wire.h"
+
+/***********************************************************************/
+
+typedef struct kptl_data kptl_data_t;
+typedef struct kptl_rx_buffer kptl_rx_buffer_t;
+typedef struct kptl_peer kptl_peer_t;
+
+#define POSTED_OBJECT_TYPE_RESERVED     0
+#define POSTED_OBJECT_TYPE_TX           1
+#define POSTED_OBJECT_TYPE_RXB          2
+
+typedef struct
+{
+        __u32 pof_type : 2;
+}kptl_posted_object_flags_t;
+
+typedef struct kptl_posted_object
+{
+        kptl_data_t                    *po_kptllnd_data; /* LND Instance Data */
+        kptl_posted_object_flags_t      po_flags;        /* flags and state   */
+} kptl_posted_object_t;
+
+typedef struct kptl_rx                          /* receive message */
+{
+        struct list_head        rx_list;        /* queue for attention */
+        atomic_t                rx_refcount;
+        kptl_rx_buffer_t       *rx_rxb;         /* the rx buffer pointer */
+        kptl_msg_t             *rx_msg;
+        int                     rx_nob;         /* the number of bytes rcvd */
+        ptl_process_id_t        rx_initiator;   /* who sent the packet */
+        kptl_peer_t            *rx_peer;        /* pointer to peer */
+        size_t                  rx_payload[0];  /* payload QQQ*/
+} kptl_rx_t;
+
+typedef struct kptl_rx_buffer_pool
+{
+        spinlock_t              rxbp_lock;
+        struct list_head        rxbp_list;
+        int                     rxbp_count;     /* the number of elements in the list   */
+        int                     rxbp_reserved;  /* the number currently reserved        */
+        int                     rxbp_shutdown;  /* the shutdown flag for the pool       */
+        int                     rxbp_posted;    /* the number of elements posted        */
+}kptl_rx_buffer_pool_t;
+
+typedef enum
+{
+        RXB_STATE_UNINITIALIZED  = 0,
+        RXB_STATE_IDLE           = 1,
+        RXB_STATE_POSTED         = 2,
+}kptl_rxb_state_t;
+
+struct kptl_rx_buffer
+{
+        /* NB - because this buffer is assigned to an MD's user_ptr,
+         * it MUST have kptl_posted_object_t as the first member
+         * so that the real type of the element can be determined
+         */
+        kptl_posted_object_t    rxb_po;
+        kptl_rx_buffer_pool_t  *rxb_pool;
+        struct list_head        rxb_list;       /* for the rxb_pool list */
+        struct list_head        rxb_repost_list;/* for the kptl_sched_rxbq list*/
+        kptl_rxb_state_t        rxb_state;      /* the state of this rx buffer*/
+        atomic_t                rxb_refcount;   /* outstanding rx */
+        ptl_handle_md_t         rxb_mdh;        /* the portals memory descriptor (MD) handle */
+        void                   *rxb_buffer;     /* the buffer */
+
+};
+
+typedef enum
+{
+        TX_STATE_UNINITIALIZED          = 0,
+        TX_STATE_ON_IDLE_QUEUE          = 1,
+        TX_STATE_ALLOCATED              = 2,
+        TX_STATE_WAITING_CREDITS        = 3,
+        TX_STATE_WAITING_RESPONSE       = 4
+}kptl_tx_state_t;
+
+typedef enum
+{
+        TX_TYPE_RESERVED                = 0,
+        TX_TYPE_SMALL_MESSAGE           = 1,
+        TX_TYPE_LARGE_PUT               = 2,
+        TX_TYPE_LARGE_GET               = 3,
+        TX_TYPE_LARGE_PUT_RESPONSE      = 4,
+        TX_TYPE_LARGE_GET_RESPONSE      = 5,
+}kptl_tx_type_t;
+
+typedef struct kptl_tx                           /* transmit message */
+{
+        /* NB - because this buffer is assigned to an MD's user_ptr,
+         * it MUST have kptl_posted_object_t as the first member
+         * so that the real type of the element can be determined
+         */
+        kptl_posted_object_t    tx_po;
+        struct list_head        tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
+        struct list_head        tx_schedlist; /* queue on the scheduler's kptl_sched_txq */
+        atomic_t                tx_refcount;  /* Posted buffer reference count */
+        kptl_tx_state_t         tx_state;     /* the state of this tx descriptor */
+        int                     tx_seen_send_end; /* if we've seen a SEND_END event */
+        int                     tx_seen_reply_end; /* if we've seen a REPLY_END event */
+        kptl_tx_type_t          tx_type;      /* type of transfer */
+        int                     tx_status;    /* the status of this tx descriptor */
+        int                     tx_isnblk;    /* I'm reserved for non-blocking sends */
+        ptl_handle_md_t         tx_mdh;       /* the portals memory descriptor (MD) handle */
+        ptl_handle_md_t         tx_mdh_msg;   /* the portals MD handle for the initial message */
+        lnet_msg_t             *tx_ptlmsg;    /* the cookie for finalize */
+        lnet_msg_t             *tx_ptlmsg_reply; /* the cookie for the reply message */
+        kptl_msg_t             *tx_msg;       /* the message data */
+        kptl_peer_t            *tx_peer;      /* the peer this is waiting on */
+        unsigned long           tx_deadline;  /* deadline */
+        kptl_rx_t              *tx_associated_rx; /* Associated RX for Bulk RDMA */
+
+        unsigned int            tx_payload_niov;
+        struct iovec           *tx_payload_iov;
+        lnet_kiov_t            *tx_payload_kiov;
+        unsigned int            tx_payload_offset;
+        int                     tx_payload_nob;
+} kptl_tx_t;
+
+
+typedef enum
+{
+        PEER_STATE_UNINITIALIZED        = 0,
+        PEER_STATE_WAITING_HELLO        = 1,
+        PEER_STATE_ACTIVE               = 2,
+        PEER_STATE_CANCELED             = 3,
+}kptllnd_peer_state_t;
+
+struct kptl_peer
+{
+        struct list_head        peer_list;
+        atomic_t                peer_refcount;          /* The current references */
+        kptllnd_peer_state_t    peer_state;
+        kptl_data_t            *peer_kptllnd_data;      /* LND Instance Data */
+        spinlock_t              peer_lock;              /* serialize */
+        struct list_head        peer_pending_txs;       /* queue of pending txs */
+        struct list_head        peer_active_txs;        /* queue of active txs */
+        int                     peer_active_txs_change_counter;/* updated when peer_active_txs changes */
+        lnet_nid_t              peer_nid;               /* who's on the other end(s) */
+        __u64                   peer_incarnation;       /* peer's incarnation */
+        __u64                   peer_tx_seqnum;         /* next seq# to send with*/
+        int                     peer_credits;           /* number of send credits */
+        int                     peer_outstanding_credits;/* number of peer credits */
+        __u64                   peer_next_matchbits;    /* Next value to use for tx desc matchbits */
+        __u64                   peer_last_matchbits_seen; /* last matchbits seen*/
+};
+
+
+
+struct kptl_data
+{
+        int                     kptl_init;             /* initialisation state */
+        volatile int            kptl_shutdown;         /* shut down? */
+        atomic_t                kptl_nthreads;         /* # live threads */
+        lnet_ni_t              *kptl_ni;               /* _the_ LND instance */
+        ptl_handle_ni_t         kptl_nih;              /* network interface handle */
+        ptl_process_id_t        kptl_portals_id;       /* Portals ID of interface */
+        __u64                   kptl_incarnation;      /* which one am I */
+        ptl_handle_eq_t         kptl_eqh;              /* Event Queue (EQ) */
+
+        spinlock_t              kptl_sched_lock;       /* serialise the next 3 members*/
+        wait_queue_head_t       kptl_sched_waitq;      /* schedulers sleep here */
+        struct list_head        kptl_sched_txq;        /* tx requiring attention */
+        struct list_head        kptl_sched_rxq;        /* rx requiring attention */
+        struct list_head        kptl_sched_rxbq;       /* rxb requiring reposting */
+
+        kptl_rx_buffer_pool_t   kptl_rx_buffer_pool;   /* rx buffer pool */
+        cfs_mem_cache_t*        kptl_rx_cache;         /* rx descriptor cache */
+
+        struct kptl_tx         *kptl_tx_descs;         /* the tx descriptors array */
+        spinlock_t              kptl_tx_lock;          /* serialise the next 4 members*/
+        struct list_head        kptl_idle_txs;         /* idle tx descriptors */
+        struct list_head        kptl_idle_nblk_txs;    /* idle reserved tx descriptors */
+        wait_queue_head_t       kptl_idle_tx_waitq;    /* block here for tx descriptor */
+
+        rwlock_t                kptl_peer_rw_lock;     /* lock for peer table */
+        struct list_head       *kptl_peers;            /* hash table of all my known peers */
+        struct list_head        kptl_canceled_peers;   /* peers in the canceled state */
+        int                     kptl_canceled_peers_counter; /* updated when canceled_peers is modified */
+        int                     kptl_peer_hash_size;   /* size of kptl_peers */
+        atomic_t                kptl_npeers;           /* # peers extant */
+
+};
+
+typedef struct kptl_stats
+{
+        int                     kps_incoming_checksums_calculated;
+        int                     kps_incoming_checksums_invalid;
+        int                     kps_cleaning_caneled_peers;     /* MP Safe*/
+        int                     kps_checking_buckets;
+        int                     kps_too_many_peers;             /* MP Safe*/
+        int                     kps_peers_created;              /* MP Safe*/
+        int                     kps_no_credits;
+        int                     kps_saving_last_credit;
+        int                     kps_rx_allocated;
+        int                     kps_rx_released;
+        int                     kps_rx_allocation_failed;
+        int                     kps_tx_allocated;              /* MP Safe*/
+        int                     kps_tx_released;               /* MP Safe*/
+        int                     kpt_tx_allocation_failed;      /* MP Safe*/
+        int                     kpx_recv_delayed;
+        int                     kpx_send_routing;
+        int                     kpx_send_target_is_router;
+}kptl_stats_t;
+
+/*
+ * Note: Stats updates are not atomic (for performance reasons)
+ * and therefore not MP safe.  They are more an indication of
+ * things that are going on than an actual count.
+ *
+ * (e.g. if kps_checking_buckets weren't incrementing at some
+ *  number per second, that would be an indication that the
+ *  scheduler thread is stuck or stopped)
+ *
+ * However, where possible the stats updates are placed inside
+ * a spinlock to make them consistent; these are marked MP Safe above.
+ *
+ */
+#define STAT_UPDATE(n) do{ ++kptllnd_stats.n; }while(0)
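+/* usage sketch: e.g. STAT_UPDATE(kps_rx_allocated) in the rx allocation path */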
+
+
+enum
+{
+    PTLLND_INIT_NOTHING     = 0,
+    PTLLND_INIT_DATA        = 1,
+    PTLLND_INIT_TXD         = 2,
+    PTLLND_INIT_RXD         = 3,
+    PTLLND_INIT_ALL         = 4,
+};
+
+
+extern kptl_tunables_t  kptllnd_tunables;
+extern kptl_stats_t     kptllnd_stats;
+
+int kptllnd_startup (
+        lnet_ni_t *ni);
+
+void kptllnd_shutdown (
+        lnet_ni_t *ni);
+
+int kptllnd_ctl(
+        lnet_ni_t *ni,
+        unsigned int cmd,
+        void *arg);
+
+int kptllnd_send (
+        lnet_ni_t *ni,
+        void *private,
+        lnet_msg_t *lntmsg);
+
+int kptllnd_recv (
+        lnet_ni_t *ni,
+        void *private,
+        lnet_msg_t *lntmsg,
+        int delayed,
+        unsigned int niov,
+        struct iovec *iov,
+        lnet_kiov_t *kiov,
+        unsigned int offset,
+        unsigned int mlen,
+        unsigned int rlen);
+
+void kptllnd_eq_callback(
+        ptl_event_t *evp);
+
+int  kptllnd_scheduler(
+        void *arg);
+
+int  kptllnd_thread_start(
+        int (*fn)(void *arg),
+        int id,
+        kptl_data_t *kptllnd_data);
+
+int  kptllnd_tunables_init(void);
+void kptllnd_tunables_fini(void);
+
+const char *get_ev_type_string(
+        int evtype);
+
+const char *get_msg_type_string(
+        int type);
+
+kptl_stats_t* kpttllnd_get_stats(void);
+
+void
+kptllnd_posted_object_setup(
+        kptl_posted_object_t* posted_obj,
+        kptl_data_t *kptllnd_data,
+        int type);
+
+/*
+ * RX BUFFER SUPPORT FUNCTIONS
+ */
+
+void
+kptllnd_rx_buffer_pool_init(
+        kptl_rx_buffer_pool_t *rxbp);
+
+void
+kptllnd_rx_buffer_pool_fini(
+        kptl_rx_buffer_pool_t *rxbp);
+
+int
+kptllnd_rx_buffer_pool_reserve(
+        kptl_rx_buffer_pool_t *rxbp,
+        kptl_data_t *kptllnd_data,
+        int count);
+
+void
+kptllnd_rx_buffer_pool_unreserve(
+        kptl_rx_buffer_pool_t *rxbp,
+        int count);
+
+void
+kptllnd_rx_buffer_callback(
+        ptl_event_t *ev);
+
+void
+kptllnd_rx_buffer_scheduled_post(
+        kptl_rx_buffer_t *rxb);
+
+void
+kptllnd_rx_buffer_post_handle_error(
+        kptl_rx_buffer_t *rxb);
+
+void
+kptllnd_rx_buffer_decref(
+        kptl_rx_buffer_t *rxb,
+        const char *owner);
+
+/*
+ * RX SUPPORT FUNCTIONS
+ */
+void
+kptllnd_rx_scheduler_handler(
+        kptl_rx_t *rx);
+
+void
+kptllnd_rx_addref(
+        kptl_rx_t *rx,
+        const char *owner);
+
+void
+kptllnd_rx_decref(
+        kptl_rx_t *rx,
+        const char *owner,
+        kptl_data_t *kptllnd_data);
+
+/*
+ * PEER SUPPORT FUNCTIONS
+ */
+void
+kptllnd_peer_decref (
+        kptl_peer_t *peer,
+        const char *owner);
+void
+kptllnd_peer_addref (
+        kptl_peer_t *peer,
+        const char *owner);
+
+int
+kptllnd_peer_del (
+        kptl_data_t *kptllnd_data,
+        lnet_nid_t nid);
+
+void
+kptllnd_peer_cancel(
+        kptl_peer_t *peer);
+
+void
+kptllnd_peer_queue_tx (
+        kptl_peer_t *peer,
+        kptl_tx_t *tx);
+
+void
+kptllnd_peer_queue_bulk_rdma_tx_locked(
+        kptl_peer_t *peer,
+        kptl_tx_t *tx);
+
+void
+kptllnd_peer_dequeue_tx(
+        kptl_peer_t *peer,
+        kptl_tx_t *tx);
+void
+kptllnd_peer_dequeue_tx_locked(
+        kptl_peer_t *peer,
+        kptl_tx_t *tx);
+
+int
+kptllnd_peer_connect (
+        kptl_tx_t *tx,
+        lnet_nid_t nid );
+
+void
+kptllnd_peer_check_sends (
+        kptl_peer_t *peer );
+void
+kptllnd_peer_check_bucket (
+        int idx,
+        kptl_data_t *kptllnd_data);
+
+void
+kptllnd_tx_launch (
+        kptl_tx_t *tx,
+        lnet_nid_t target_nid,
+        lnet_msg_t *ptlmsg );
+
+kptl_peer_t *
+kptllnd_peer_find (
+        kptl_data_t *kptllnd_data,
+        lnet_nid_t nid);
+
+kptl_peer_t *
+kptllnd_peer_handle_hello (
+        kptl_data_t *kptllnd_data,
+        lnet_nid_t nid,
+        kptl_msg_t *msg);
+
+static inline struct list_head *
+kptllnd_nid2peerlist (kptl_data_t *kptllnd_data,lnet_nid_t nid)
+{
+        unsigned int hash = ((unsigned int)nid) % kptllnd_data->kptl_peer_hash_size;
+
+        return (&kptllnd_data->kptl_peers [hash]);
+}
+
+/*
+ * TX SUPPORT FUNCTIONS
+ */
+int
+kptllnd_setup_tx_descs (
+        kptl_data_t *kptllnd_data);
+
+void
+kptllnd_cleanup_tx_descs(
+        kptl_data_t *kptllnd_data);
+
+void
+kptllnd_tx_addref(
+        kptl_tx_t *tx);
+void
+kptllnd_tx_decref(
+        kptl_tx_t *tx);
+void
+kptllnd_tx_scheduled_decref(
+        kptl_tx_t *tx);
+void
+kptllnd_tx_done (
+        kptl_tx_t *tx);
+kptl_tx_t *
+kptllnd_get_idle_tx(
+        kptl_data_t *kptllnd_data,
+        int may_block,
+        kptl_tx_type_t purpose);
+
+void
+kptllnd_tx_callback(
+        ptl_event_t *ev);
+
+/*
+ * MESSAGE SUPPORT FUNCTIONS
+ */
+void
+kptllnd_init_msg(
+        kptl_msg_t *msg,
+        int type,
+        int body_nob);
+
+void
+kptllnd_msg_pack(
+        kptl_msg_t *msgp,
+        int credits,
+        lnet_nid_t dstnid,
+        __u64 dststamp,
+        __u64 seq,
+        kptl_data_t *kptllnd_data);
+
+int
+kptllnd_msg_unpack(
+        kptl_msg_t *msg,
+        int nob,
+        kptl_data_t *kptllnd_data);
+
+/*
+ * MISC SUPPORT FUNCTIONS
+ */
+
+typedef union {
+        struct iovec iov[PTL_MD_MAX_IOV];
+        ptl_kiov_t kiov[PTL_MD_MAX_IOV];
+}tempiov_t;
+
+void
+kptllnd_setup_md(
+        kptl_data_t     *kptllnd_data,
+        ptl_md_t        *md,
+        unsigned int     op,
+        kptl_tx_t       *tx,
+        unsigned int     payload_niov,
+        struct iovec    *payload_iov,
+        lnet_kiov_t     *payload_kiov,
+        unsigned int     payload_offset,
+        int              payload_nob,
+        tempiov_t       *tempiov);
+
+int kptllnd_process_scheduled_tx(kptl_data_t *kptllnd_data);
+int kptllnd_process_scheduled_rx(kptl_data_t *kptllnd_data);
+int kptllnd_process_scheduled_rxb(kptl_data_t *kptllnd_data);
+
+static inline lnet_nid_t ptl2lnetnid(kptl_data_t *kptllnd_data,ptl_nid_t portals_nid)
+{
+        return PTL_MKNID(PTL_NIDNET(kptllnd_data->kptl_ni->ni_nid),   PTL_NIDADDR(portals_nid) );
+}
+
+static inline ptl_nid_t lnet2ptlnid(kptl_data_t *kptllnd_data,lnet_nid_t lnet_nid)
+{
+        return PTL_MKNID(PTL_NIDNET(kptllnd_data->kptl_portals_id.nid), PTL_NIDADDR(lnet_nid) );
+}
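+
+/* NB both helpers keep the address bits and substitute the network bits:
+ * ptl2lnetnid() grafts the Portals address onto this NI's LNET network,
+ * while lnet2ptlnid() grafts the LNET address onto the Portals network. */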
+
+#ifdef PJK_DEBUGGING
+
+#define PJK_UT_MSG_ALWAYS(fmt, a...)                    \
+do{                                                     \
+        printk("<1>ptllnd:%-30s:%u:",__FUNCTION__,cfs_curproc_pid());       \
+        printk(fmt,## a);                               \
+        CDEBUG(D_TRACE,fmt,## a);                       \
+}while(0)
+
+#define PJK_UT_MSG_SIMULATION(fmt, a...)        PJK_UT_MSG_ALWAYS(fmt, ## a )
+
+
+#if 1
+#define PJK_UT_MSG_DATA(fmt, a...)              do{}while(0)
+#else
+#define PJK_UT_MSG_DATA(fmt, a...)              PJK_UT_MSG_ALWAYS(fmt, ## a )
+#endif
+
+#if 1
+#define PJK_UT_MSG(fmt, a...)                   do{}while(0)
+#else
+#define PJK_UT_MSG(fmt, a...)                   PJK_UT_MSG_ALWAYS(fmt, ## a )
+#endif
+
+
+#define SIMULATION_FAIL_BLOCKING_TX_PUT_ALLOC   0       /* 0x00000001 */
+#define SIMULATION_FAIL_BLOCKING_TX_GET_ALLOC   1       /* 0x00000002 */
+#define SIMULATION_FAIL_BLOCKING_TX             2       /* 0x00000004 */
+#define SIMULATION_FAIL_BLOCKING_RX_ALLOC       3       /* 0x00000008 */
+
+#define IS_SIMULATION_ENABLED(x) \
+        (((*kptllnd_tunables.kptl_simulation_bitmap) & 1<< SIMULATION_##x) != 0)
+
+
+#else
+
+
+#define PJK_UT_MSG_ALWAYS(fmt, a...)            do{}while(0)
+#define PJK_UT_MSG_SIMULATION(fmt, a...)        do{}while(0)
+#define PJK_UT_MSG_DATA(fmt, a...)              do{}while(0)
+#define PJK_UT_MSG(fmt, a...)                   do{}while(0)
+
+#define IS_SIMULATION_ENABLED(x)                0
+
+#endif
+
diff --git a/lnet/klnds/ptllnd/ptllnd_cb.c b/lnet/klnds/ptllnd/ptllnd_cb.c
new file mode 100644 (file)
index 0000000..1978dae
--- /dev/null
@@ -0,0 +1,1052 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "ptllnd.h"
+
+void kptllnd_clean_canceled_peers(kptl_data_t *kptllnd_data);
+
+void
+kptllnd_setup_md(
+        kptl_data_t     *kptllnd_data,
+        ptl_md_t        *md,
+        unsigned int     op,
+        kptl_tx_t       *tx,
+        unsigned int     payload_niov,
+        struct iovec    *payload_iov,
+        lnet_kiov_t      *payload_kiov,
+        unsigned int     payload_offset,
+        int              payload_nob,
+        tempiov_t       *tempiov)
+{
+        unsigned int niov = 0;
+
+        PJK_UT_MSG_DATA("%s nob=%d offset=%d niov=%d\n",
+                op == PTL_MD_OP_GET ? "GET" : "PUT",
+                payload_nob,payload_offset,payload_niov);
+
+        /* One but not both of iov or kiov must be NULL (XOR) */
+        LASSERT( (payload_iov != 0 && payload_kiov == 0) ||
+                 (payload_iov == 0 && payload_kiov != 0 ) );
+
+        /* We have a put or get operation*/
+        LASSERT( op == PTL_MD_OP_GET || op == PTL_MD_OP_PUT);
+
+
+        /* Only one operation then unlink */
+        md->threshold = 1;
+
+        /* Get operations need threshold +1 to handle the
+         * reply operation
+         */
+        if( op == PTL_MD_OP_GET)
+                md->threshold++;
+
+        /* setup the options*/
+        md->options = op;
+
+        /* If this is a PUT then we need to disable the ACK:
+         * we don't need one, we'll get a callback when it is complete */
+        if( op == PTL_MD_OP_PUT)
+                md->options |= PTL_MD_ACK_DISABLE;
+
+        /* we don't care about the start event */
+        md->options |= PTL_MD_EVENT_START_DISABLE;
+
+        /* point back to this TX descriptor so we know what to complete
+         * when the event is triggered */
+        md->user_ptr = tx;
+
+        md->eq_handle = kptllnd_data->kptl_eqh;
+        if (payload_iov != NULL){
+
+                while (payload_offset >= payload_iov->iov_len) {
+                        payload_offset -= payload_iov->iov_len;
+                        payload_iov++;
+                        payload_niov--;
+                        LASSERT (payload_niov > 0);
+                }
+
+                while(payload_nob){
+                        LASSERT( payload_offset < payload_iov->iov_len);
+                        LASSERT (payload_niov > 0);
+                        LASSERT (niov < sizeof(tempiov->iov)/sizeof(tempiov->iov[0]));
+
+                        tempiov->iov[niov].iov_base = payload_iov->iov_base + payload_offset;
+                        tempiov->iov[niov].iov_len  = min((int)(payload_iov->iov_len - payload_offset),
+                                                (int)payload_nob);
+
+                        payload_offset = 0;
+                        payload_nob -= tempiov->iov[niov].iov_len;
+                        payload_iov++;
+                        payload_niov--;
+                        niov++;
+                }
+
+                md->start = tempiov->iov;
+                md->options |= PTL_MD_IOVEC;
+
+        }else{
+
+
+                while (payload_offset >= payload_kiov->kiov_len) {
+                        payload_offset -= payload_kiov->kiov_len;
+                        payload_kiov++;
+                        payload_niov--;
+                        LASSERT (payload_niov > 0);
+                }
+
+                while(payload_nob){
+                        LASSERT( payload_offset < payload_kiov->kiov_len);
+                        LASSERT (payload_niov > 0);
+                        LASSERT (niov < sizeof(tempiov->kiov)/sizeof(tempiov->kiov[0]));
+
+                        tempiov->kiov[niov].kiov_page   = payload_kiov->kiov_page;
+                        tempiov->kiov[niov].kiov_offset = payload_kiov->kiov_offset + payload_offset;
+                        tempiov->kiov[niov].kiov_len    = min((int)(payload_kiov->kiov_len - payload_offset),
+                                                        (int)payload_nob);
+
+                        payload_offset = 0;
+                        payload_nob -= tempiov->kiov[niov].kiov_len;
+                        payload_kiov++;
+                        payload_niov--;
+                        niov++;
+                }
+
+                md->start = tempiov->kiov;
+                md->options |= PTL_MD_KIOV;
+        }
+
+        /*
+         * When using PTL_MD_IOVEC or PTL_MD_KIOV this is not
+         * length, rather it is # iovs
+         */
+        md->length = niov;
+}
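+
+/* For example (from the logic above): a GET on a kiov payload leaves
+ * md->threshold == 2 (one for the GET, one for the REPLY), md->options ==
+ * PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE | PTL_MD_KIOV, and
+ * md->length == the number of kiov fragments rather than a byte count. */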
+
+int
+kptllnd_start_bulk_rdma(
+        kptl_data_t     *kptllnd_data,
+        kptl_rx_t       *rx,
+        lnet_msg_t      *lntmsg,
+        unsigned int     op,
+        unsigned int     payload_niov,
+        struct iovec    *payload_iov,
+        lnet_kiov_t     *payload_kiov,
+        unsigned int     payload_offset,
+        int              payload_nob)
+{
+        kptl_tx_t       *tx;
+        ptl_md_t         md;
+        ptl_err_t        ptl_rc;
+        ptl_err_t        ptl_rc2;
+        int              rc;
+        tempiov_t        tempiov;
+        kptl_msg_t      *rxmsg = rx->rx_msg;
+        kptl_peer_t     *peer = rx->rx_peer;
+
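+        /*
+         * The incoming request advertised matchbits for a bulk transfer:
+         * we PtlGet the payload from the initiator to service its PUT,
+         * or PtlPut our payload to the initiator to service its GET.
+         */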
+
+        /*
+         * Get an idle tx descriptor
+         * may NOT block: (That's the "0" param)
+         */
+        LASSERT(op ==  PTL_MD_OP_GET || op == PTL_MD_OP_PUT);
+        tx = kptllnd_get_idle_tx(kptllnd_data,0,
+                op == PTL_MD_OP_GET ? TX_TYPE_LARGE_PUT_RESPONSE :
+                                      TX_TYPE_LARGE_GET_RESPONSE);
+        if(tx == NULL){
+                CERROR ("Can't start bulk rdma %d to "LPX64": tx descs exhausted\n",
+                        op, rx->rx_initiator.nid);
+                return -ENOMEM;
+        }
+
+        /*
+         * Attach the RX to the TX and take a reference
+         */
+        tx->tx_associated_rx = rx;
+        kptllnd_rx_addref(rx,"tx");
+
+        PJK_UT_MSG_DATA(">>> %s rx=%p associated with tx=%p\n",
+                op == PTL_MD_OP_GET ? "GET" : "PUT",
+                rx,tx);
+        PJK_UT_MSG_DATA("matchibts=" LPX64 "\n",
+                rxmsg->ptlm_u.req.kptlrm_matchbits);
+
+        /*
+         * Setup the MD
+         */
+        kptllnd_setup_md(kptllnd_data,&md,op,tx,
+                payload_niov,payload_iov,payload_kiov,
+                payload_offset,payload_nob,&tempiov);
+
+        spin_lock(&peer->peer_lock);
+
+        /*
+         * Attach the MD
+         */
+        ptl_rc = PtlMDBind(
+                kptllnd_data->kptl_nih,
+                md,
+                PTL_UNLINK,
+                &tx->tx_mdh);
+        if(ptl_rc != PTL_OK){
+                CERROR("PtlMDBind failed %d\n",ptl_rc);
+
+                spin_unlock(&peer->peer_lock);
+                /*
+                 * Just drop the ref for this MD because it was never
+                 * posted to portals
+                 */
+                tx->tx_mdh = PTL_INVALID_HANDLE;
+                rc = -ENOMEM;
+                goto end;
+        }
+
+        /*
+         * And save the portals message
+         */
+        tx->tx_ptlmsg = lntmsg;
+
+        /*
+         * Queue the request on the peer
+         */
+        kptllnd_peer_queue_bulk_rdma_tx_locked(peer,tx);
+
+        /*
+         * Grab a ref so the TX doesn't disappear
+         */
+        kptllnd_tx_addref(tx);
+
+        spin_unlock(&peer->peer_lock);
+
+
+        /*
+         * Do the Put
+         */
+        if( op == PTL_MD_OP_PUT)
+        {
+                ptl_rc = PtlPut (
+                            tx->tx_mdh,
+                            PTL_NOACK_REQ,         /* we don't need an ack */
+                            rx->rx_initiator,      /* peer "address" */
+                            *kptllnd_tunables.kptl_portal,         /* portal */
+                            0,                     /* cookie */
+                            rxmsg->ptlm_u.req.kptlrm_matchbits, /* match bits */
+                            0,                     /* offset - unused */
+                            0);                    /* header data */
+        }else{
+                ptl_rc = PtlGet (
+                            tx->tx_mdh,
+                            rx->rx_initiator,      /* peer "address" */
+                            *kptllnd_tunables.kptl_portal,         /* portal */
+                            0,                     /* cookie */
+                            rxmsg->ptlm_u.req.kptlrm_matchbits, /* match bits */
+                            0);                    /* offset - unused*/
+        }
+
+        if(ptl_rc != PTL_OK){
+                CERROR("Ptl%s failed: %d\n",
+                        op == PTL_MD_OP_GET ? "Get" : "Put",ptl_rc);
+
+                spin_lock(&peer->peer_lock);
+
+                /*
+                 * Unlink the MD because it's not yet in use
+                 * this should happen immediately
+                 */
+                LASSERT(atomic_read(&tx->tx_refcount)>1);
+                ptl_rc2 = PtlMDUnlink(tx->tx_mdh);
+                LASSERT(ptl_rc2 == PTL_OK);
+
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                /* If we have LUSTRE Portals UNLINK semantics
+                 * we'll get an unlink event.  If we have standard
+                 * Portals semantics we decref the TX explicitly here
+                 */
+                tx->tx_mdh = PTL_INVALID_HANDLE;
+                kptllnd_tx_decref(tx);
+#endif
+                /*
+                 * We are returning failure, so we don't
+                 * want tx_done to finalize the message;
+                 * set it to NULL
+                 */
+                tx->tx_ptlmsg = NULL;
+
+                kptllnd_peer_dequeue_tx_locked(peer,tx);
+                tx->tx_peer = NULL;
+
+                spin_unlock(&peer->peer_lock);
+
+                rc = -ENOMEM;
+                goto end;
+        }
+
+        rc = 0;
+
+end:
+        /*
+         * Release our temporary reference
+         * (this one could be the last)
+         */
+        kptllnd_tx_decref(tx);
+
+        PJK_UT_MSG("<<< rc=%d\n",rc);
+        return rc;
+}
+
+
+void
+kptlnd_do_put(
+        kptl_tx_t       *tx,
+        lnet_msg_t      *lntmsg,
+        lnet_hdr_t      *hdr,
+        kptl_data_t     *kptllnd_data,
+        lnet_process_id_t target,
+        unsigned int     payload_niov,
+        struct iovec    *payload_iov,
+        lnet_kiov_t     *payload_kiov,
+        unsigned int     payload_offset,
+        unsigned int     payload_nob)
+{
+        LASSERT(tx != NULL);
+
+        tx->tx_payload_niov     = payload_niov;
+        tx->tx_payload_iov      = payload_iov;
+        tx->tx_payload_kiov     = payload_kiov;
+        tx->tx_payload_offset   = payload_offset;
+        tx->tx_payload_nob      = payload_nob;
+
+        tx->tx_msg->ptlm_u.req.kptlrm_hdr = *hdr;
+        kptllnd_init_msg (tx->tx_msg,
+                          PLTLND_MSG_TYPE_PUT,
+                          sizeof(kptl_request_msg_t));
+        kptllnd_tx_launch(tx, target.nid,lntmsg);
+}
+
+int
+kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+        lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
+        int               type = lntmsg->msg_type;
+        lnet_process_id_t target = lntmsg->msg_target;
+        int               target_is_router = lntmsg->msg_target_is_router;
+        int               routing = lntmsg->msg_routing;
+        unsigned int      payload_niov = lntmsg->msg_niov;
+        struct iovec     *payload_iov = lntmsg->msg_iov;
+        lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+        unsigned int      payload_offset = lntmsg->msg_offset;
+        unsigned int      payload_nob = lntmsg->msg_len;
+        kptl_tx_t        *tx = NULL;
+        kptl_data_t      *kptllnd_data = ni->ni_data;
+        int               nob;
+        int               rc;
+        kptl_rx_t        *rx = NULL;
+
+        PJK_UT_MSG_DATA(">>> SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+        PJK_UT_MSG_DATA("nob=%d nov=%d offset=%d to %s r=%d\n",
+               payload_nob, payload_niov, payload_offset,
+               libcfs_id2str(target),
+               routing);
+
+        if(routing)
+                STAT_UPDATE(kpx_send_routing);
+        if(target_is_router)
+                STAT_UPDATE(kpx_send_target_is_router);
+
+        /* NB 'private' is different depending on what we're sending.... */
+
+        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+               payload_nob, payload_niov, libcfs_id2str(target));
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* Thread context */
+        LASSERT (!in_interrupt());
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+        /*
+         * we rely on this being true, as we only store hdr
+         * in the tx descriptor, and just ignore type
+         */
+        LASSERT(hdr->type == type);
+
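+        /*
+         * Dispatch summary: payloads that fit in max_immd_size go out as
+         * a single IMMEDIATE message (break out of the switch); larger
+         * PUTs/GETs send a small request carrying matchbits and move the
+         * payload by RDMA; a REPLY reuses the rx it is answering.
+         */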
+        switch (type) {
+        default:
+                LBUG();
+                return -EINVAL;
+
+        case LNET_MSG_PUT:
+                PJK_UT_MSG_DATA("LNET_MSG_PUT\n");
+
+                /*
+                 * Get an idle tx descriptor
+                 * may block: caller is app thread (That's the "1" param)
+                 */
+                tx = kptllnd_get_idle_tx(kptllnd_data,1,TX_TYPE_LARGE_PUT);
+                if(tx == NULL){
+                        CERROR ("Can't send %d to "LPX64": tx descs exhausted\n",
+                                type, target.nid);
+                        return -ENOMEM;
+                }
+
+                /* Is the payload small enough not to need RDMA? */
+                nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]);
+                if (nob <= *kptllnd_tunables.kptl_max_immd_size)
+                        break;
+
+                kptlnd_do_put(tx,lntmsg,hdr,kptllnd_data,target,
+                                payload_niov,payload_iov,
+                                payload_kiov,payload_offset,payload_nob);
+
+                PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+                return 0;
+
+        case LNET_MSG_GET:
+
+                PJK_UT_MSG_DATA("LNET_MSG_GET nob=%d\n",lntmsg->msg_md->md_length);
+
+                /*
+                 * Get an idle tx descriptor
+                 * may block: caller is app thread (That's the "1" param)
+                 */
+                tx = kptllnd_get_idle_tx(kptllnd_data,1,TX_TYPE_LARGE_GET);
+                if(tx == NULL){
+                        CERROR ("Can't send %d to "LPX64": tx descs exhausted\n",
+                                type, target.nid);
+                        return -ENOMEM;
+                }
+
+                /*
+                 * If routing, go immediate
+                 */
+                if(target_is_router || routing)
+                        break;
+
+                /* Is the payload small enough not to need RDMA? */
+                nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[lntmsg->msg_md->md_length]);
+                if (nob <= *kptllnd_tunables.kptl_max_immd_size)
+                        break;
+
+                tx->tx_payload_offset = 0;
+                tx->tx_payload_niov = lntmsg->msg_md->md_niov;
+                tx->tx_payload_nob  = lntmsg->msg_md->md_length;
+
+                if((lntmsg->msg_md->md_options & PTL_MD_KIOV) != 0){
+                        tx->tx_payload_iov = 0;
+                        tx->tx_payload_kiov = lntmsg->msg_md->md_iov.kiov;
+                }else{
+                        tx->tx_payload_iov = lntmsg->msg_md->md_iov.iov;
+                        tx->tx_payload_kiov = 0;
+                }
+
+
+                tx->tx_msg->ptlm_u.req.kptlrm_hdr = *hdr;
+                kptllnd_init_msg (tx->tx_msg,
+                                  PTLLND_MSG_TYPE_GET,
+                                  sizeof(kptl_request_msg_t));
+
+                tx->tx_ptlmsg_reply =
+                        lnet_create_reply_msg(kptllnd_data->kptl_ni,lntmsg);
+
+                goto launch;
+
+        case LNET_MSG_ACK:
+                PJK_UT_MSG_DATA("LNET_MSG_ACK\n");
+                LASSERT (payload_nob == 0);
+                break;
+
+        case LNET_MSG_REPLY:
+                PJK_UT_MSG_DATA("LNET_MSG_REPLY\n");
+
+                /*
+                 * Reply's private is the incoming rx descriptor
+                 */
+                rx = private;
+                LASSERT(rx != NULL);
+
+                if(lntmsg==NULL)
+                {
+                        /*
+                         * Get an idle tx descriptor
+                         * may NOT block: (That's the "0" param)
+                         */
+                        tx = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_LARGE_PUT);
+                        if(tx == NULL){
+                                CERROR ("Can't send %d to "LPX64": tx descs exhausted\n",
+                                        type, target.nid);
+                                return -ENOMEM;
+                        }
+
+                        /* Is the payload small enough not to need RDMA? */
+                        nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]);
+                        if (nob <= *kptllnd_tunables.kptl_max_immd_size)
+                                break;
+
+                        kptlnd_do_put(tx,lntmsg,hdr,kptllnd_data,target,
+                                        payload_niov,payload_iov,
+                                        payload_kiov,payload_offset,payload_nob);
+
+                        PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+                        return 0;
+                }else{
+                        /*
+                         * If the request was to NOT do RDMA
+                         * break out and just send back an IMMEDIATE message
+                         */
+                        if (rx->rx_msg->ptlm_type == PTLLND_MSG_TYPE_IMMEDIATE) {
+                                /* RDMA not expected */
+                                nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]);
+                                if (nob > *kptllnd_tunables.kptl_max_immd_size) {
+                                        CERROR("REPLY for "LPX64" too big but RDMA not requested:"
+                                               "%d (max for message is %d)\n",
+                                               target.nid, payload_nob,
+                                               *kptllnd_tunables.kptl_max_immd_size);
+                                        CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
+                                               nob, target.nid);
+                                        return -EINVAL;
+                                }
+                                break;
+                        }
+
+
+                        /* Incoming message consistent with RDMA? */
+                        if (rx->rx_msg->ptlm_type != PTLLND_MSG_TYPE_GET) {
+                                CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
+                                       target.nid, rx->rx_msg->ptlm_type);
+                                return -EINVAL;
+                        }
+
+                        rc = kptllnd_start_bulk_rdma(
+                                kptllnd_data,
+                                rx,
+                                lntmsg,
+                                PTL_MD_OP_PUT,
+                                payload_niov,
+                                payload_iov,
+                                payload_kiov,
+                                payload_offset,
+                                payload_nob);
+                        PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS rc=%d\n",rc);
+                        return rc;
+                }
+        }
+
+
+        if(tx == NULL){
+                PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_IMMEDIATE\n");
+
+                /*
+                 * Get an idle tx descriptor
+                 * may NOT block: (That's the "0" param)
+                 */
+                tx = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE);
+                if(tx == NULL){
+                        CERROR ("Can't send %d to "LPX64": tx descs exhausted\n",
+                                type, target.nid);
+                        return -ENOMEM;
+                }
+        }else{
+                PJK_UT_MSG_DATA("Using PTLLND_MSG_TYPE_IMMEDIATE\n");
+                /*
+                 * Repurpose this TX
+                 */
+                tx->tx_type = TX_TYPE_SMALL_MESSAGE;
+
+        }
+
+        LASSERT (offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob])
+                 <= *kptllnd_tunables.kptl_max_immd_size);
+
+        /*
+         * Setup the header
+         */
+        tx->tx_msg->ptlm_u.immediate.kptlim_hdr = *hdr;
+
+        if (payload_nob > 0) {
+                if (payload_kiov != NULL)
+                        lnet_copy_kiov2flat(
+                                *kptllnd_tunables.kptl_max_immd_size,
+                                tx->tx_msg->ptlm_u.immediate.kptlim_payload,
+                                0,
+                                payload_niov, payload_kiov,
+                                payload_offset, payload_nob);
+                else
+                        lnet_copy_iov2flat(
+                                *kptllnd_tunables.kptl_max_immd_size,
+                                tx->tx_msg->ptlm_u.immediate.kptlim_payload,
+                                0,
+                                payload_niov, payload_iov,
+                                payload_offset, payload_nob);
+        }
+
+        nob = offsetof(kptl_immediate_msg_t, kptlim_payload[payload_nob]);
+        kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_IMMEDIATE,nob);
+
+
+launch:
+        kptllnd_tx_launch(tx, target.nid,lntmsg);
+        PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+        return 0;
+}
+
+int kptllnd_eager_recv(void *private,void **new_privatep)
+{
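+        /*
+         * LNet wants to delay delivery: take a private copy of the
+         * message so the rx buffer below can be released (and reposted)
+         * without losing the data.
+         */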
+        kptl_rx_t    *rx = private;
+
+
+        LASSERT(rx->rx_nob < *kptllnd_tunables.kptl_max_immd_size);
+
+        /*
+         * Copy the data directly into the RX
+         */
+        memcpy(rx->rx_payload,rx->rx_msg,rx->rx_nob);
+
+        *new_privatep = rx;
+
+        /*
+         * Free the request buffer; it
+         * will repost if we are the last ones using it
+         */
+        LASSERT(rx->rx_rxb != NULL);
+        kptllnd_rx_buffer_decref(rx->rx_rxb,"rx-eager");
+        rx->rx_rxb = NULL;
+
+
+        /*
+         * Now point the msg buffer at the RX descriptor payload
+         * rather than the RXB (because that is now freed!)
+         */
+        rx->rx_msg = (kptl_msg_t*)rx->rx_payload;
+
+        return 0;
+}
+
+
+int kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+                  unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+                  unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+        kptl_rx_t    *rx = private;
+        kptl_msg_t   *rxmsg = rx->rx_msg;
+        kptl_data_t  *kptllnd_data = rx->rx_rxb->rxb_po.po_kptllnd_data;
+        int           nob;
+        int           rc;
+
+        PJK_UT_MSG_DATA(">>> RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR\n");
+        PJK_UT_MSG_DATA("niov=%d offset=%d mlen=%d rlen=%d\n",
+                niov,offset,mlen,rlen);
+
+        LASSERT (mlen <= rlen);
+        LASSERT (mlen >= 0);
+        LASSERT (!in_interrupt());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+
+        if(delayed)
+                STAT_UPDATE(kpx_recv_delayed);
+
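+        /*
+         * IMMEDIATE: copy the payload straight out of the message.
+         * GET: nothing to receive here (the reply moves by RDMA).
+         * PUT: pull the bulk data from the sender with a Portals GET.
+         */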
+        switch(rxmsg->ptlm_type)
+        {
+        default:
+                LBUG();
+                rc = -EINVAL;
+                break;
+
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_IMMEDIATE\n");
+
+                nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[rlen]);
+                if (nob > *kptllnd_tunables.kptl_max_immd_size) {
+                        CERROR ("Immediate message from "LPX64" too big: %d\n",
+                                rxmsg->ptlm_u.immediate.kptlim_hdr.src_nid, rlen);
+                        rc = -EINVAL;
+                        break;
+                }
+
+                if (kiov != NULL)
+                        lnet_copy_flat2kiov(
+                                niov, kiov, offset,
+                                *kptllnd_tunables.kptl_max_immd_size,
+                                rxmsg->ptlm_u.immediate.kptlim_payload,
+                                0,
+                                mlen);
+                else
+                        lnet_copy_flat2iov(
+                                niov, iov, offset,
+                                *kptllnd_tunables.kptl_max_immd_size,
+                                rxmsg->ptlm_u.immediate.kptlim_payload,
+                                0,
+                                mlen);
+
+                lnet_finalize (ni, lntmsg, 0);
+                rc = 0;
+                break;
+
+        case PTLLND_MSG_TYPE_GET:
+                PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_GET\n");
+                /* We get called here just to discard any junk after the
+                 * GET hdr. */
+                LASSERT (lntmsg == NULL); /* What is this all about ???*/
+
+                lnet_finalize (ni, lntmsg, 0);
+
+                rc = 0;
+                break;
+
+        case PLTLND_MSG_TYPE_PUT:
+                PJK_UT_MSG_DATA("PLTLND_MSG_TYPE_PUT\n");
+
+                if (mlen == 0) { /* No payload */
+                        lnet_finalize(ni, lntmsg, 0);
+                        rc = 0;
+                }else{
+                        rc = kptllnd_start_bulk_rdma(
+                                kptllnd_data,
+                                rx,
+                                lntmsg,
+                                PTL_MD_OP_GET,
+                                niov,
+                                iov,
+                                kiov,
+                                offset,
+                                mlen);
+                }
+                break;
+        }
+
+        /*
+         * We're done with the RX
+         */
+        kptllnd_rx_decref(rx,"lnet_parse",kptllnd_data);
+
+        PJK_UT_MSG_DATA("<<< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR rc=%d\n",rc);
+        return rc;
+}
+
+
+void
+kptllnd_eq_callback(ptl_event_t *ev)
+{
+        kptl_posted_object_t *po = ev->md.user_ptr;
+
+        /*
+         * Just delegate to the correct callback
+         * based on object type
+         */
+        if(po->po_flags.pof_type == POSTED_OBJECT_TYPE_TX)
+                kptllnd_tx_callback(ev);
+        else
+                kptllnd_rx_buffer_callback(ev);
+}
+
+typedef struct
+{
+        int id;                         /* The unique ID */
+        kptl_data_t *kptllnd_data;     /* pointer to the NAL instance */
+
+}kptllnd_thread_data_t;
+
+void
+kptllnd_thread_fini (kptllnd_thread_data_t *thread_data)
+{
+        atomic_dec (&thread_data->kptllnd_data->kptl_nthreads);
+        PORTAL_FREE(thread_data,sizeof(*thread_data));
+}
+
+int
+kptllnd_thread_start (int (*fn)(void *arg), int id,kptl_data_t *kptllnd_data)
+{
+        long pid;
+        kptllnd_thread_data_t *thread_data;
+
+        /*
+         * Allocate the thread data so we can pass more than
+         * one param to the thread function
+         */
+        PORTAL_ALLOC (thread_data,sizeof(*thread_data));
+        if(thread_data == NULL){
+                CERROR("No memory to allocate thread data structure\n");
+                return -ENOMEM;
+        }
+
+        atomic_inc (&kptllnd_data->kptl_nthreads);
+
+        /*
+         * Initialize thread data structure
+         */
+        thread_data->id = id;
+        thread_data->kptllnd_data = kptllnd_data;
+
+        pid = kernel_thread (fn, thread_data, 0);
+
+        /*
+         * On error cleanup the context explicitly
+         */
+        if (pid < 0){
+                CERROR("Failed to start kernel_thread id=%d\n",id);
+                kptllnd_thread_fini(thread_data);
+                return (int)pid;
+        }else{
+                return 0;
+        }
+}
+
+
+
+int
+kptllnd_scheduler(void *arg)
+{
+        kptllnd_thread_data_t *thread_data = arg;
+        int             id = thread_data->id;
+        kptl_data_t    *kptllnd_data = thread_data->kptllnd_data;
+        char            name[16];
+        cfs_waitlink_t  waitlink;
+        int             bucket =0;
+        int             buckets_to_check;
+        cfs_time_t      last_check = cfs_time_current();
+        cfs_duration_t  duration;
+        time_t          duration_sec;
+
+        PJK_UT_MSG(">>>\n");
+
+        /*
+         * Daemonize
+         */
+        snprintf(name, sizeof(name), "kptllnd_sd_%02d", id);
+        libcfs_daemonize(name);
+
+        cfs_waitlink_init(&waitlink);
+
+        /*
+         * Keep going around
+         */
+        while(!kptllnd_data->kptl_shutdown) {
+
+                /*
+                 * Wait on the scheduler waitq
+                 */
+
+                set_current_state (TASK_INTERRUPTIBLE);
+                cfs_waitq_add(&kptllnd_data->kptl_sched_waitq, &waitlink);
+                cfs_waitq_timedwait(&waitlink,cfs_time_seconds(PTLLND_TIMEOUT_SEC));
+                set_current_state (TASK_RUNNING);
+                cfs_waitq_del (&kptllnd_data->kptl_sched_waitq, &waitlink);
+
+
+                duration = cfs_time_sub(cfs_time_current(),last_check);
+                duration_sec = cfs_duration_sec(duration);
+
+                /*
+                 * Check all the buckets over one kptl_timeout interval,
+                 * but determine only the fraction we are supposed to be
+                 * checking now.
+                 * Example:
+                 * (duration/HZ) = 5 sec
+                 * HASH_SIZE = 100
+                 * kptl_timeout = 60 sec.
+                 * Result = 5 * 100 / 60 = 8 buckets to be checked (actually 8.3)
+                 */
+                buckets_to_check = duration_sec * kptllnd_data->kptl_peer_hash_size /
+                                   (*kptllnd_tunables.kptl_timeout);
+
+                if(buckets_to_check){
+                        /*PJK_UT_MSG("Check Buckets %d\n",buckets_to_check);*/
+                        STAT_UPDATE(kps_checking_buckets);
+
+                        /*
+                         * Because we round the bucket count down, we need to
+                         * carry the left-over portion (the .3 in the above
+                         * example) somewhere so we don't lose it.  Do this by
+                         * advancing last_check not to "now" but to some time
+                         * less than "now" that accounts for the rounding
+                         * error: advance it by exactly the time that checking
+                         * 8 whole buckets represents, so the remaining 0.3
+                         * is picked up on the next pass.
+                         */
+                        last_check = cfs_time_add( last_check,
+                                cfs_time_seconds(buckets_to_check *
+                                        *kptllnd_tunables.kptl_timeout /
+                                        kptllnd_data->kptl_peer_hash_size));
+
+                        /*
+                         * If we are supposed to check buckets then
+                         * do that here.
+                         */
+                        while(buckets_to_check){
+                                kptllnd_peer_check_bucket(bucket,kptllnd_data);
+                                bucket = (bucket+1) % kptllnd_data->kptl_peer_hash_size;
+                                --buckets_to_check;
+                        }
+                }
+
+                /*
+                 * Drain the RX queue
+                 */
+                while(kptllnd_process_scheduled_rx(kptllnd_data)!=0);
+
+                /*
+                 * Repost all RXBs
+                 */
+                while(kptllnd_process_scheduled_rxb(kptllnd_data)!=0);
+
+                /*
+                 * Drain the TX queue.  Note RX's can cause new TX's
+                 * to be added to the queue.
+                 */
+                while(kptllnd_process_scheduled_tx(kptllnd_data)!=0);
+
+
+                /*
+                 * Clean any canceled peers
+                 */
+                kptllnd_clean_canceled_peers(kptllnd_data);
+        }
+
+        kptllnd_thread_fini(thread_data);
+        PJK_UT_MSG("<<<\n");
+        return (0);
+}
+
+
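+/* Each of the helpers below pops at most one item off its scheduler queue
+ * under kptl_sched_lock, handles it outside the lock, and returns non-zero
+ * while more work may remain, so the scheduler can drain in a loop. */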
+int kptllnd_process_scheduled_tx(kptl_data_t *kptllnd_data)
+{
+        kptl_tx_t      *tx = 0;
+        unsigned long   flags;
+
+        spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+
+        /*
+         * If the list is not empty, grab the first one
+         * and pull it off the list
+         */
+        if(!list_empty(&kptllnd_data->kptl_sched_txq)){
+                tx = list_entry (kptllnd_data->kptl_sched_txq.next,
+                                 kptl_tx_t, tx_schedlist);
+                list_del_init(&tx->tx_schedlist);
+        }
+
+        spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+
+        if(tx){
+                PJK_UT_MSG(">>> tx=%p\n",tx);
+                kptllnd_tx_done(tx);
+                PJK_UT_MSG("<<<\n");
+        }
+
+        return tx!=NULL;
+}
+
+int kptllnd_process_scheduled_rx(kptl_data_t *kptllnd_data)
+{
+        kptl_rx_t      *rx = 0;
+        unsigned long   flags;
+
+        spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+
+        /*
+         * If the list is not empty, grab the first one
+         * and pull it off the list
+         */
+        if(!list_empty(&kptllnd_data->kptl_sched_rxq)){
+                rx = list_entry (kptllnd_data->kptl_sched_rxq.next,
+                                 kptl_rx_t, rx_list);
+                list_del_init(&rx->rx_list);
+        }
+
+        spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+
+        if(rx)
+                kptllnd_rx_scheduler_handler(rx);
+
+        return rx!=NULL;
+}
+
+int kptllnd_process_scheduled_rxb(kptl_data_t *kptllnd_data)
+{
+        kptl_rx_buffer_t       *rxb = 0;
+        unsigned long           flags;
+
+        spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+
+        /*
+         * If the list is not empty, grab the first one
+         * and pull it off the list
+         */
+        if(!list_empty(&kptllnd_data->kptl_sched_rxbq)){
+                rxb = list_entry (kptllnd_data->kptl_sched_rxbq.next,
+                                 kptl_rx_buffer_t, rxb_repost_list);
+                list_del_init(&rxb->rxb_repost_list);
+        }
+
+        spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+
+        if(rxb)
+                kptllnd_rx_buffer_post_handle_error(rxb);
+
+        return rxb!=NULL;
+}
+
+void kptllnd_clean_canceled_peers(kptl_data_t *kptllnd_data)
+{
+        unsigned long           flags;
+        kptl_peer_t            *peer;
+        struct list_head       *iter;
+        int                     counter;
+
+        read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+
+        if(!list_empty(&kptllnd_data->kptl_canceled_peers)){
+                PJK_UT_MSG("Cleaning Canceled Peers\n");
+                STAT_UPDATE(kps_cleaning_caneled_peers);
+        }
+
+again:
+        counter = kptllnd_data->kptl_canceled_peers_counter;
+
+        list_for_each(iter, &kptllnd_data->kptl_canceled_peers) {
+                peer = list_entry (iter, kptl_peer_t, peer_list);
+
+
+                /*
+                 * Take a reference so we can manipulate it
+                 * outside the lock
+                 */
+                kptllnd_peer_addref(peer,"temp");
+
+                read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+                kptllnd_peer_cancel(peer);
+                kptllnd_peer_decref(peer,"temp");
+
+                read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+                /*
+                 * if the list has changed then we need to start again
+                 */
+                if(counter != kptllnd_data->kptl_canceled_peers_counter)
+                        goto again;
+        }
+
+        read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+}
diff --git a/lnet/klnds/ptllnd/ptllnd_modparams.c b/lnet/klnds/ptllnd/ptllnd_modparams.c
new file mode 100644 (file)
index 0000000..f9ff67e
--- /dev/null
@@ -0,0 +1,168 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "ptllnd.h"
+
+static int ntx = PTLLND_NTX;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of 'normal' message descriptors");
+
+static int ntx_nblk = PTLLND_NTX_NBLK;
+CFS_MODULE_PARM(ntx_nblk, "i", int, 0444,
+               "# of 'reserved' message descriptors");
+
+static int concurrent_peers = PTLLND_CONCURRENT_PEERS;
+CFS_MODULE_PARM(concurrent_peers, "i", int, 0444,
+               "maximum number of peers that may connect");
+
+static int cksum = PTLLND_CKSUM;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+               "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = PTLLND_TIMEOUT;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "timeout (seconds)");
+
+static int portal = PTLLND_PORTAL;
+CFS_MODULE_PARM(portal, "i", int, 0444,
+               "portal id");
+
+static int rxb_npages = PTLLND_RXB_NPAGES;
+CFS_MODULE_PARM(rxb_npages, "i", int, 0444,
+               "# of pages for rx buffers");
+
+static int credits = PTLLND_CREDITS;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "concurrent sends");
+
+static int peercredits = PTLLND_PEERCREDITS;
+CFS_MODULE_PARM(peercredits, "i", int, 0444,
+               "concurrent sends to 1 peer");
+
+static int max_immd_size = PTLLND_MAX_MSG_SIZE;
+CFS_MODULE_PARM(max_immd_size, "i", int, 0444,
+               "max size of immediate message");
+
+static int peer_hash_table_size = PTLLND_PEER_HASH_SIZE;
+CFS_MODULE_PARM(peer_hash_table_size, "i", int, 0444,
+               "# of slots in the peer hash table");
+
+#ifdef PJK_DEBUGGING
+static int simulation_bitmap = 0;
+CFS_MODULE_PARM(simulation_bitmap, "i", int, 0444,
+               "simulation bitmap");
+#endif         
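+
+/* All of the above can also be set at module load time, e.g.
+ * (illustrative values only):
+ *     modprobe kptllnd credits=128 peercredits=8 timeout=60
+ */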
+
+
+kptl_tunables_t kptllnd_tunables = {
+        .kptl_ntx                    = &ntx,
+        .kptl_ntx_nblk               = &ntx_nblk,
+        .kptl_concurrent_peers       = &concurrent_peers,
+        .kptl_cksum                  = &cksum,
+        .kptl_portal                 = &portal,
+        .kptl_timeout                = &timeout,
+        .kptl_rxb_npages             = &rxb_npages,
+        .kptl_credits                = &credits,
+        .kptl_peercredits            = &peercredits,
+        .kptl_max_immd_size          = &max_immd_size,
+        .kptl_peer_hash_table_size   = &peer_hash_table_size,
+#ifdef PJK_DEBUGGING        
+        .kptl_simulation_bitmap      = &simulation_bitmap,
+#endif        
+};
+
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+
+static ctl_table kptllnd_ctl_table[] = {
+       {1, "ntx", &ntx,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {2, "ntx_nblk", &ntx_nblk,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {3, "concurrent_peers", &concurrent_peers,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {4, "cksum", &cksum,
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {5, "timeout", &timeout,
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {6, "portal", &portal,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {7, "rxb_npages", &rxb_npages,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {8, "credits", &kptl_credits,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {9, "peercredits", &kptl_peercredits,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {10, "max_immd_size", &kptl_max_immd_size,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {11, "peer_hash_table_size,", &kptl_peer_hash_table_size,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+        
+#ifdef PJK_DEBUGGING    
+       {12, "simulation_bitmap,", &kptl_simulation_bitmap,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+#endif
+        
+       {0}
+};
+
+static ctl_table kptllnd_top_ctl_table[] = {
+       {203, "ptllnd", NULL, 0, 0555, kptllnd_ctl_table},
+       {0}
+};
+
+int
+kptllnd_tunables_init ()
+{
+       kptllnd_tunables.kptl_sysctl =
+               register_sysctl_table(kptllnd_top_ctl_table, 0);
+
+       if (kptllnd_tunables.kptl_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+       return 0;
+}
+
+void
+kptllnd_tunables_fini ()
+{
+       if (kptllnd_tunables.kptl_sysctl != NULL)
+               unregister_sysctl_table(kptllnd_tunables.kptl_sysctl);
+}
+
+#else
+
+int
+kptllnd_tunables_init ()
+{
+       return 0;
+}
+
+void
+kptllnd_tunables_fini ()
+{
+}
+
+#endif
+
diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c
new file mode 100644 (file)
index 0000000..bbe1979
--- /dev/null
@@ -0,0 +1,1315 @@
+#include "ptllnd.h"
+#include <libcfs/list.h>
+
+void
+kptllnd_peer_destroy (
+        kptl_peer_t *peer);
+
+kptl_peer_t *
+kptllnd_peer_find_locked (
+        kptl_data_t *kptllnd_data,
+        lnet_nid_t nid);
+
+
+
+int
+kptllnd_peer_create_locked (
+        kptl_data_t *kptllnd_data,
+        kptl_peer_t **peerp,
+        lnet_nid_t nid)
+{
+        kptl_peer_t     *peer;
+        int             rc;
+
+        PJK_UT_MSG(">>> nid="LPX64"\n",nid);
+
+        LASSERT (nid != PTL_NID_ANY);
+
+        /*
+         * But first check we haven't exceeded our maximum
+         * number of peers
+         */
+        if (atomic_read(&kptllnd_data->kptl_npeers) >=
+            *kptllnd_tunables.kptl_concurrent_peers) {
+                STAT_UPDATE(kps_too_many_peers);
+                CERROR("Can't create peer: too many peers\n");
+                return -EOVERFLOW;      /* !! but at least it distinguishes */
+        }
+
+        PORTAL_ALLOC(peer, sizeof (*peer));
+        if (peer == NULL) {
+                CERROR("Cannot allocate memory for peer\n");
+                return -ENOMEM;
+        }
+
+        memset(peer, 0, sizeof(*peer));         /* zero flags etc */
+
+        INIT_LIST_HEAD (&peer->peer_list);       /* not in the peer table yet */
+        INIT_LIST_HEAD (&peer->peer_pending_txs);
+        INIT_LIST_HEAD (&peer->peer_active_txs);
+        spin_lock_init (&peer->peer_lock);
+
+
+        peer->peer_state = PEER_STATE_WAITING_HELLO;
+        peer->peer_kptllnd_data = kptllnd_data;
+        peer->peer_nid = nid;
+        //peer->peer_incarnation = 0;
+        //peer->peer_tx_seqnum = 0;
+
+        /*
+         * Just enough to send the connect message
+         */
+        peer->peer_credits = 1;
+
+        /*
+         * We just posted this many buffers ready for the peer
+         * to send into, so give back this many credits
+         */
+        peer->peer_outstanding_credits = *kptllnd_tunables.kptl_peercredits - 1;
+
+
+        peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+        //peer->peer_last_matchbits_seen = 0;
+
+
+        /*
+         * Reserve space in the RX buffer pool for this new peer
+         */
+        rc = kptllnd_rx_buffer_pool_reserve(
+                &kptllnd_data->kptl_rx_buffer_pool,
+                kptllnd_data,
+                *kptllnd_tunables.kptl_peercredits);
+        if(rc != 0){
+                CERROR("Cannot reserve rx buffer pool space\n");
+                PORTAL_FREE(peer, sizeof (*peer));
+                return rc;
+        }
+
+        /*
+         * 1 ref for the list
+         * 1 for the caller
+         */
+        atomic_set (&peer->peer_refcount, 2);
+
+        /* npeers only grows with the global lock held */
+        atomic_inc(&kptllnd_data->kptl_npeers);
+
+        /* And add this to the list */
+        list_add_tail (&peer->peer_list,
+                       kptllnd_nid2peerlist (kptllnd_data,nid));
+
+        STAT_UPDATE(kps_peers_created);
+
+        PJK_UT_MSG("<<< Peer=%p nid="LPX64"\n",peer,nid);
+        *peerp = peer;
+        return 0;
+}
+
+
+void
+kptllnd_peer_destroy (
+        kptl_peer_t *peer)
+{
+        kptl_data_t *kptllnd_data = peer->peer_kptllnd_data;
+
+        PJK_UT_MSG("Peer=%p\n",peer);
+
+        LASSERT (atomic_read (&peer->peer_refcount) == 0);
+        /* Not on the peer list */
+        LASSERT (list_empty (&peer->peer_list));
+        /* No pending tx descriptors */
+        LASSERT (list_empty (&peer->peer_pending_txs));
+        /* No active tx descriptors */
+        LASSERT (list_empty (&peer->peer_active_txs));
+
+        PORTAL_FREE (peer, sizeof (*peer));
+
+        kptllnd_rx_buffer_pool_unreserve(
+                &kptllnd_data->kptl_rx_buffer_pool,
+                *kptllnd_tunables.kptl_peercredits);
+
+
+        /* NB a peer's connections keep a reference on their peer until
+         * they are destroyed, so we can be assured that _all_ state to do
+         * with this peer has been cleaned up when its refcount drops to
+         * zero. */
+        atomic_dec(&kptllnd_data->kptl_npeers);
+}
+
+
+void
+kptllnd_peer_addref (
+        kptl_peer_t *peer,
+        const char *owner)
+{
+        atomic_inc(&peer->peer_refcount);
+
+        /*
+         * The below message could actually be out of sync
+         * with the real ref count, and is for informational purposes
+         * only
+         */
+        PJK_UT_MSG("peer=%p owner=%s count=%d\n",peer,owner,
+                atomic_read(&peer->peer_refcount));
+}
+
+void
+kptllnd_peer_decref (
+        kptl_peer_t *peer,
+        const char *owner)
+{
+        unsigned long    flags;
+        kptl_data_t     *kptllnd_data = peer->peer_kptllnd_data;
+
+        if( !atomic_dec_and_test(&peer->peer_refcount)){
+
+                /*
+                 * The below message could actually be out of sync
+                 * with the real ref count, and is for informational purposes
+                 * only
+                 */
+                PJK_UT_MSG("peer=%p owner=%s count=%d\n",peer,owner,
+                        atomic_read(&peer->peer_refcount));
+                return;
+        }
+
+        PJK_UT_MSG("peer=%p owner=%s LAST REF\n",peer,owner);
+
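+        /*
+         * Last ref: unlink from whichever peer list we are on.  If this
+         * peer was canceled, bump the canceled-peers change counter so
+         * any scan in kptllnd_clean_canceled_peers() restarts safely.
+         */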
+        write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+        list_del_init (&peer->peer_list);
+        if(peer->peer_state == PEER_STATE_CANCELED)
+                kptllnd_data->kptl_canceled_peers_counter++;
+        write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+        kptllnd_peer_destroy(peer);
+}
+
+
+void
+kptllnd_peer_cancel_pending_txs(
+        kptl_peer_t *peer)
+{
+        struct list_head   list;
+        struct list_head  *tx_temp;
+        struct list_head  *tx_next;
+        kptl_tx_t         *tx;
+
+
+        INIT_LIST_HEAD (&list);
+
+        /*
+         * Transfer all the PENDING TX's to a temporary list
+         * while holding the peer lock
+         */
+        spin_lock(&peer->peer_lock);
+
+        if(!list_empty(&peer->peer_pending_txs))
+                PJK_UT_MSG("Clearing Pending TXs\n");
+
+        list_for_each_safe (tx_temp, tx_next, &peer->peer_pending_txs) {
+                tx = list_entry (tx_temp, kptl_tx_t, tx_list);
+
+                list_del_init(&tx->tx_list);
+                list_add(&tx->tx_list,&list);
+        }
+
+        spin_unlock(&peer->peer_lock);
+
+        /*
+         * Now release the references outside of the peer_lock
+         */
+        list_for_each_safe (tx_temp, tx_next, &list) {
+                tx = list_entry (tx_temp, kptl_tx_t, tx_list);
+                list_del_init(&tx->tx_list);
+                kptllnd_tx_decref(tx);
+        }
+}
+
+void
+kptllnd_peer_cancel_active_txs(
+        kptl_peer_t *peer)
+{
+        struct list_head  *iter;
+        kptl_tx_t         *tx;
+        ptl_err_t          ptl_rc;
+        int                counter;
+
+        spin_lock(&peer->peer_lock);
+
+        if(!list_empty(&peer->peer_active_txs))
+                PJK_UT_MSG("Clearing Active TXs\n");
+
+again:
+
+        counter = peer->peer_active_txs_change_counter;
+
+        list_for_each (iter, &peer->peer_active_txs) {
+                tx = list_entry (iter, kptl_tx_t, tx_list);
+
+                /*
+                 * Hold onto one ref so we can make these
+                 * unlink calls even though we have
+                 * released the lock
+                 */
+                kptllnd_tx_addref(tx);
+
+                spin_unlock(&peer->peer_lock);
+
+
+                /*
+                 * Question:  Why is it safe to access tx_mdh_msg and tx_mdh
+                 * outside the peer_lock?  We could be racing with
+                 * tx_callback...
+                 */
+
+                if(!PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE)){
+                        PJK_UT_MSG("Unlink mhd_msg\n");
+                        LASSERT(atomic_read(&tx->tx_refcount)>1);
+                        ptl_rc = PtlMDUnlink(tx->tx_mdh_msg);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                        if(ptl_rc == PTL_OK) {
+                                tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+                                kptllnd_tx_decref(tx);
+                        }
+#endif
+                }
+
+                if(!PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)){
+                        PJK_UT_MSG("Unlink mdh\n");
+                        LASSERT(atomic_read(&tx->tx_refcount)>1);
+                        ptl_rc = PtlMDUnlink(tx->tx_mdh);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                        if(ptl_rc == PTL_OK){
+                                tx->tx_mdh = PTL_INVALID_HANDLE;
+                                kptllnd_tx_decref(tx);
+                        }
+#endif
+                }
+
+                kptllnd_tx_decref(tx);
+
+                spin_lock(&peer->peer_lock);
+
+                /*
+                 * If a change in the list has been detected
+                 * go back to the beginning
+                 */
+                if( counter != peer->peer_active_txs_change_counter)
+                        goto again;
+        }
+
+        spin_unlock(&peer->peer_lock);
+}
+
+void
+kptllnd_peer_cancel(
+        kptl_peer_t *peer)
+{
+        kptl_data_t *kptllnd_data = peer->peer_kptllnd_data;
+        unsigned long      flags;
+        int                list_owns_ref=0;
+
+        PJK_UT_MSG(">>> Peer=%p\n",peer);
+
+        write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+        if(peer->peer_state != PEER_STATE_CANCELED){
+                peer->peer_state = PEER_STATE_CANCELED;
+                list_del_init(&peer->peer_list);
+                list_add(&peer->peer_list,&kptllnd_data->kptl_canceled_peers);
+                kptllnd_data->kptl_canceled_peers_counter++;
+                list_owns_ref = 1;
+        }
+        write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+
+        /*
+         * First cancel the pending and active TXs
+         */
+        kptllnd_peer_cancel_pending_txs(peer);
+        kptllnd_peer_cancel_active_txs(peer);
+
+
+        /* lose peerlist's ref as long as we haven't done
+           this before */
+        if(list_owns_ref)
+                kptllnd_peer_decref(peer,"list");
+
+        PJK_UT_MSG("<<< Peer=%p\n",peer);
+}
+
+int
+kptllnd_peer_del (
+        kptl_data_t *kptllnd_data,
+        lnet_nid_t nid)
+{
+        struct list_head  *ptmp;
+        struct list_head  *pnxt;
+        kptl_peer_t        *peer;
+        int                lo;
+        int                hi;
+        int                i;
+        unsigned long      flags;
+        int                rc = -ENOENT;
+
+
+        PJK_UT_MSG(">>> NID="LPX64"\n",nid);
+
+        /*
+         * Find the single bucket we are supposed to look at
+         * or if nid = PTL_NID_ANY then look at all of the buckets
+         */
+        if (nid != PTL_NID_ANY)
+                lo = hi = kptllnd_nid2peerlist(kptllnd_data,nid) - kptllnd_data->kptl_peers;
+        else {
+                lo = 0;
+                hi = kptllnd_data->kptl_peer_hash_size - 1;
+        }
+
+again:
+        read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kptllnd_data->kptl_peers[i]) {
+                        peer = list_entry (ptmp, kptl_peer_t, peer_list);
+
+                        /*
+                         * Is this the right one?
+                         */
+                        if (!(nid == PTL_NID_ANY || peer->peer_nid == nid))
+                                continue;
+
+                        kptllnd_peer_addref(peer,"temp"); /* 1 ref for me... */
+
+                        read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock,
+                                               flags);
+
+                        kptllnd_peer_cancel(peer);
+                        kptllnd_peer_decref(peer,"temp"); /* ...until here */
+
+                        rc = 0;         /* matched something */
+
+                        /* start again now I've dropped the lock */
+                        goto again;
+                }
+        }
+
+        read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+        PJK_UT_MSG("<<< rc=%d\n",rc);
+        return (rc);
+}
+
+void
+kptllnd_peer_queue_tx_locked (
+        kptl_peer_t *peer,
+        kptl_tx_t *tx)
+{
+        PJK_UT_MSG("Peer=%p TX=%p\n",peer,tx);
+
+        LASSERT(peer->peer_state != PEER_STATE_CANCELED);
+        LASSERT(tx->tx_state == TX_STATE_ALLOCATED);
+        tx->tx_state = TX_STATE_WAITING_CREDITS;
+        LASSERT(tx->tx_peer == NULL);
+
+        kptllnd_peer_addref(peer,"tx");
+        tx->tx_peer = peer;
+
+        tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ);
+        list_add_tail(&tx->tx_list, &peer->peer_pending_txs);
+}
+
+void
+kptllnd_peer_queue_tx (
+        kptl_peer_t *peer,
+        kptl_tx_t *tx)
+{
+        spin_lock(&peer->peer_lock);
+        kptllnd_peer_queue_tx_locked (peer, tx);
+        spin_unlock(&peer->peer_lock);
+
+        kptllnd_peer_check_sends(peer);
+}
+
+
+void
+kptllnd_peer_queue_bulk_rdma_tx_locked(
+        kptl_peer_t *peer,
+        kptl_tx_t *tx)
+{
+        PJK_UT_MSG("Peer=%p TX=%p\n",peer,tx);
+
+        LASSERT(peer->peer_state != PEER_STATE_CANCELED);
+        LASSERT(tx->tx_state == TX_STATE_ALLOCATED);
+        tx->tx_state = TX_STATE_WAITING_RESPONSE;
+
+        LASSERT(tx->tx_type == TX_TYPE_LARGE_PUT_RESPONSE ||
+                tx->tx_type == TX_TYPE_LARGE_GET_RESPONSE);
+
+        LASSERT(tx->tx_peer == NULL);
+        kptllnd_peer_addref(peer,"tx");
+        tx->tx_peer = peer;
+        tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ);
+
+        list_add_tail(&tx->tx_list, &peer->peer_active_txs);
+        peer->peer_active_txs_change_counter++;
+}
+
+void
+kptllnd_peer_dequeue_tx_locked(
+        kptl_peer_t *peer,
+        kptl_tx_t *tx)
+{
+        list_del_init(&tx->tx_list);
+        /*
+         * The tx could be on the active list
+         * or possibly the pending list.  Either way
+         * we're safe to update the active txs change counter
+         * (this counter only indicates change, and in this
+         * case it's possible change, which is an acceptable
+         * usage)
+         */
+        peer->peer_active_txs_change_counter++;
+}
+
+void
+kptllnd_peer_dequeue_tx(
+        kptl_peer_t *peer,
+        kptl_tx_t *tx)
+{
+        spin_lock(&peer->peer_lock);
+        kptllnd_peer_dequeue_tx_locked(peer,tx);
+        spin_unlock(&peer->peer_lock);
+}
+
+void
+kptllnd_peer_check_sends (
+        kptl_peer_t *peer )
+{
+
+        kptl_tx_t      *tx;
+        kptl_data_t    *kptllnd_data = peer->peer_kptllnd_data;
+        int             rc,rc2;
+        ptl_md_t        md;
+        ptl_handle_me_t meh;
+        ptl_process_id_t target;
+
+        /*
+         * If there is nothing to send, and we have hit the credit
+         * high water mark, then send a no-op message
+         */
+        spin_lock(&peer->peer_lock);
+
+        PJK_UT_MSG_DATA(">>>Peer=%p Credits=%d Outstanding=%d\n",
+                peer,peer->peer_credits,peer->peer_outstanding_credits);
+
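+        /*
+         * peer_credits is our licence to send (buffers the peer has
+         * posted for us); peer_outstanding_credits is what we owe back
+         * (buffers we have reposted locally) and is piggy-backed on the
+         * next message we send.
+         */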
+        if(list_empty(&peer->peer_pending_txs) &&
+           peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER) {
+
+                /*
+                 * Get an idle tx descriptor
+                 * may NOT block: (That's the "0" param)
+                 */
+                tx = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE);
+                if( tx == NULL ) {
+                        CERROR ("Can't return credits to "LPX64": tx descs exhausted\n",
+                                peer->peer_nid);
+                }else{
+                        kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_NOOP,0);
+                        kptllnd_peer_queue_tx_locked(peer,tx);
+                }
+        }
+        /*
+         * Now go through all the sends to see what we can send
+         */
+        while(!list_empty(&peer->peer_pending_txs)) {
+                tx = list_entry (peer->peer_pending_txs.next, kptl_tx_t, tx_list);
+
+                LASSERT (tx->tx_state == TX_STATE_WAITING_CREDITS);
+                LASSERT (peer->peer_outstanding_credits >= 0);
+                LASSERT (peer->peer_outstanding_credits <= *kptllnd_tunables.kptl_peercredits);
+                LASSERT (peer->peer_credits >= 0);
+                LASSERT (peer->peer_credits <= *kptllnd_tunables.kptl_peercredits);
+
+                /*
+                 * If there are no credits we're done
+                 */
+                if (peer->peer_credits == 0) {
+                        STAT_UPDATE(kps_no_credits);
+                        CDEBUG(D_NET, LPX64": no credits\n",peer->peer_nid);
+                        break;
+                }
+
+
+                /*
+                 * If there is one credit but we have no credits to give
+                 * back then we don't use our one credit and we are done
+                 */
+                if (peer->peer_credits == 1 &&
+                    peer->peer_outstanding_credits == 0) {
+                        STAT_UPDATE(kps_saving_last_credit);
+                        CDEBUG(D_NET, LPX64": not using last credit\n",
+                               peer->peer_nid);
+                        break;
+                }
+
+                /*
+                 * Remove the tx from the list.  We don't decrement the
+                 * ref count here.  The reference is simply transferred
+                 * from the peer to this calling function, and it is this
+                 * function's responsibility to dispose of it properly.
+                 */
+                list_del_init(&tx->tx_list);
+
+                /*
+                 * If this is a NOOP, but there are other txs
+                 * pending in the queue, OR we are no longer at
+                 * the high-water mark, then it is safe to simply
+                 * discard this NOOP and continue on.
+                 */
+                if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP &&
+                    (!list_empty(&peer->peer_pending_txs) ||
+                     peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)) {
+                        /* redundant NOOP */
+                        kptllnd_tx_decref(tx);
+                        CDEBUG(D_NET, LPX64": redundant noop\n",
+                               peer->peer_nid);
+                        continue;
+                }
+
+                PJK_UT_MSG_DATA("--- TXTXTXTXTXTXTXTXTXTXTXTXTXTX\n");
+                PJK_UT_MSG_DATA("Sending TX=%p Size=%d\n",tx,tx->tx_msg->ptlm_nob);
+                PJK_UT_MSG_DATA("Target nid="LPX64"\n",peer->peer_nid);
+
+
+                /*
+                 * Assign matchbits for a put/get
+                 */
+                if(tx->tx_msg->ptlm_type == PLTLND_MSG_TYPE_PUT ||
+                   tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET){
+
+                        PJK_UT_MSG_DATA("next matchbits="LPX64" (before)\n",
+                                peer->peer_next_matchbits);
+
+
+                        /* Allocate a new match bits value.  It might not be
+                         * needed, but we've got the lock right now and we're
+                         * unlikely to wrap...
+                         *
+                         * A set of match bits at the low end is reserved, so
+                         * we cannot use them; just skip over them.  This check
+                         * protects us even in the case of 64-bit rollover.
+                         */
+                        if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS){
+                                CDEBUG(D_INFO,"Match Bits Rollover for "LPX64"\n",
+                                        peer->peer_nid);
+                                peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+
+                        }
+
+                        /*
+                         * Set the payload match bits and update the peer's counter
+                         */
+                        tx->tx_msg->ptlm_u.req.kptlrm_matchbits =
+                                peer->peer_next_matchbits ++;
+
+                        PJK_UT_MSG_DATA("next matchbits="LPX64" (after)\n",
+                                peer->peer_next_matchbits);
+                }
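+                /*
+                 * These matchbits name the bulk buffer for this
+                 * request: below we attach an ME/MD pair under the
+                 * same matchbits, and the peer uses them to GET the
+                 * payload of a PUT (or to PUT the reply to a GET).
+                 */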
+
+                /*
+                 * Complete the message: fill in all the rest
+                 * of the header
+                 */
+                kptllnd_msg_pack(
+                        tx->tx_msg,
+                        peer->peer_outstanding_credits,
+                        peer->peer_nid,
+                        peer->peer_incarnation,
+                        peer->peer_tx_seqnum,
+                        kptllnd_data);
+
+                /*
+                 * We just sent a packet
+                 */
+                peer->peer_tx_seqnum++;
+
+                /*
+                 * And we've returned all of our credits
+                 */
+                peer->peer_outstanding_credits = 0;
+
+                /*
+                 * And we have one less credit :-(
+                 */
+                peer->peer_credits--;
+
+                /*
+                 * Set the state before the PtlPut() because
+                 * we could get the PUT_END callback before PtlPut()
+                 * returns.
+                 */
+                LASSERT(tx->tx_state == TX_STATE_WAITING_CREDITS);
+                tx->tx_state = TX_STATE_WAITING_RESPONSE;
+
+                /*
+                 * Construct an address that Portals needs from the NID
+                 */
+
+                target.nid = lnet2ptlnid(kptllnd_data,peer->peer_nid);
+                target.pid = 0;
+
+                PJK_UT_MSG_DATA("Msg NOB = %d\n",tx->tx_msg->ptlm_nob);
+                PJK_UT_MSG_DATA("Returned Credits=%d\n",tx->tx_msg->ptlm_credits);
+                PJK_UT_MSG_DATA("Seq # = "LPX64"\n",tx->tx_msg->ptlm_seq);
+
+                PJK_UT_MSG("lnet TX nid=" LPX64 "\n",peer->peer_nid);
+                PJK_UT_MSG("ptl  TX nid=" LPX64 "\n",target.nid);
+
+                if(tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET ||
+                   tx->tx_msg->ptlm_type == PLTLND_MSG_TYPE_PUT){
+                        tempiov_t tempiov;
+
+#ifdef TESTING_WITH_LOOPBACK
+                        /*
+                         * When doing loopback testing the data comes back
+                         * on the given loopback nid
+                         */
+                        ptl_process_id_t target;
+                        target.nid = PTL_NID_ANY;
+                        target.pid = 0;
+#endif
+
+                        PJK_UT_MSG_DATA("matchibts=" LPX64 "\n",
+                                tx->tx_msg->ptlm_u.req.kptlrm_matchbits);
+
+
+                        /*
+                         * Attach the ME
+                         */
+                        rc = PtlMEAttach(
+                            kptllnd_data->kptl_nih,
+                            *kptllnd_tunables.kptl_portal,
+                            target,
+                            tx->tx_msg->ptlm_u.req.kptlrm_matchbits,
+                            0, /* all matchbits are valid - ignore none*/
+                            PTL_UNLINK,
+                            PTL_INS_BEFORE,
+                            &meh);
+                        if(rc != 0) {
+                                CERROR("PtlMeAttach failed %d\n",rc);
+                                goto failed;
+                        }
+
+                        /* Setup the MD */
+                        kptllnd_setup_md(kptllnd_data,&md,
+                                tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET ? PTL_MD_OP_PUT :
+                                        PTL_MD_OP_GET,
+                                tx,
+                                tx->tx_payload_niov,
+                                tx->tx_payload_iov,
+                                tx->tx_payload_kiov,
+                                tx->tx_payload_offset,
+                                tx->tx_payload_nob,
+                                &tempiov);
+
+                        /*
+                         * Add a ref for this MD, because unlink
+                         * events can happen at any time once
+                         * something is posted.
+                         */
+                        kptllnd_tx_addref(tx);
+
+                        /*
+                         * Attach the MD
+                         */
+                        rc = PtlMDAttach(
+                                meh,
+                                md,
+                                PTL_UNLINK,
+                                &tx->tx_mdh);
+                        if(rc != 0){
+                                CERROR("PtlMDAttach failed %d\n",rc);
+
+                                /*
+                                 * Just drop the ref for this MD because it was never
+                                 * posted to portals
+                                 */
+                                tx->tx_mdh = PTL_INVALID_HANDLE;
+                                kptllnd_tx_decref(tx);
+
+                                rc2 = PtlMEUnlink(meh);
+                                LASSERT(rc2 == 0);
+                                goto failed;
+                        }
+                }
+
+
+                /*
+                 * Setup the MD for the message itself
+                 */
+                md.start = tx->tx_msg;
+                md.length = tx->tx_msg->ptlm_nob;
+                md.threshold = 1;
+                md.options = PTL_MD_OP_PUT;
+                md.options |= PTL_MD_EVENT_START_DISABLE;
+                /* we don't need an ACK; we'll get an event when the send completes */
+                md.options |= PTL_MD_ACK_DISABLE;
+                md.user_ptr = tx;
+                md.eq_handle = kptllnd_data->kptl_eqh;
+
+
+                /*
+                 * Bind the MD
+                 */
+                rc = PtlMDBind (
+                        kptllnd_data->kptl_nih,
+                        md,
+                        PTL_UNLINK,
+                        &tx->tx_mdh_msg);
+                if(rc != 0){
+                        CERROR("PtlMDBind failed %d\n",rc);
+                        tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+                        goto failed;
+                }
+
+                list_add_tail(&tx->tx_list, &peer->peer_active_txs);
+                peer->peer_active_txs_change_counter++;
+                LASSERT(tx->tx_peer == peer);
+
+                /*
+                 * Grab a ref so the TX doesn't go away
+                 * if we fail.
+                 */
+                kptllnd_tx_addref(tx);
+
+                spin_unlock(&peer->peer_lock);
+
+                rc = PtlPut (
+                            tx->tx_mdh_msg,
+                            PTL_NOACK_REQ,     /* we don't need an ack */
+                            target,            /* peer "address" */
+                            *kptllnd_tunables.kptl_portal,     /* portal */
+                            0,                 /* cookie */
+                            LNET_MSG_MATCHBITS, /* match bits */
+                            0,                 /* offset */
+                            0);                /* header data */
+                if(rc != 0){
+                        CERROR("PtlPut error %d\n",rc);
+
+                        /*
+                         * Do the unlink which should succeed
+                         */
+                        LASSERT(atomic_read(&tx->tx_refcount)>1);
+                        rc2 = PtlMDUnlink(tx->tx_mdh_msg);
+                        LASSERT( rc2 == 0);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                        tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+                        kptllnd_tx_decref(tx);
+#endif
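+                        /*
+                         * NB with Lustre unlink semantics
+                         * (LUSTRE_PORTALS_UNLINK_SEMANTICS) the unlink
+                         * itself raises an event, and the event callback
+                         * drops portals' reference; without them no event
+                         * is delivered, so we must invalidate the handle
+                         * and drop the reference ourselves, as above.
+                         */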
+                        goto failed;
+                }
+
+                /*
+                 * Release our temporary reference
+                 */
+                kptllnd_tx_decref(tx);
+
+                spin_lock(&peer->peer_lock);
+
+        }
+
+
+        spin_unlock(&peer->peer_lock);
+
+        PJK_UT_MSG_DATA("<<<\n");
+        return;
+
+failed:
+
+        /*
+         * Now unlink the MDs (if they were posted)
+         */
+        if(!PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)){
+                LASSERT(atomic_read(&tx->tx_refcount)>1);
+                rc2 = PtlMDUnlink(tx->tx_mdh);
+                /*
+                 * The unlink should succeed
+                 */
+                LASSERT( rc2 == 0);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                tx->tx_mdh = PTL_INVALID_HANDLE;
+                kptllnd_tx_decref(tx);
+#endif
+        }
+
+        /*
+         * Get back the credits
+         * (??? why even do this, since we're killing the peer)
+         */
+        peer->peer_outstanding_credits += tx->tx_msg->ptlm_credits;
+        peer->peer_credits++;
+
+        spin_unlock(&peer->peer_lock);
+
+        /*
+         * And cleanup this peer
+         */
+        kptllnd_peer_cancel(peer);
+
+        /*
+         * And release the tx reference
+         */
+        kptllnd_tx_decref(tx);
+
+        PJK_UT_MSG("<<< FAILED\n");
+}
+
+int
+kptllnd_peer_timedout(kptl_peer_t *peer)
+{
+        kptl_tx_t          *tx;
+
+        spin_lock(&peer->peer_lock);
+
+        /*
+         * Check the head of the pending list for expiration.
+         * This is a queue, so if the head isn't expired then
+         * nothing else will be.
+         */
+        if(!list_empty(&peer->peer_pending_txs)){
+                tx = list_entry(peer->peer_pending_txs.next,kptl_tx_t,tx_list);
+                if(time_after_eq(jiffies,tx->tx_deadline)){
+                        spin_unlock(&peer->peer_lock);
+                        PJK_UT_MSG("Peer=%p PENDING tx=%p time=%lu sec\n",
+                                peer,tx,(jiffies - tx->tx_deadline)/HZ);
+                        return 1;
+                }
+        }
+
+        /*
+         * Check the head of the active list
+         */
+        if(!list_empty(&peer->peer_active_txs)){
+                tx = list_entry(peer->peer_active_txs.next,kptl_tx_t,tx_list);
+                if(time_after_eq(jiffies,tx->tx_deadline)){
+                        spin_unlock(&peer->peer_lock);
+                        PJK_UT_MSG("Peer=%p ACTIVE tx=%p time=%lu sec\n",
+                                peer,tx,(jiffies - tx->tx_deadline)/HZ);
+                        return 1;
+                }
+        }
+
+        spin_unlock(&peer->peer_lock);
+        return 0;
+}
+
+
+void
+kptllnd_peer_check_bucket (int idx, kptl_data_t *kptllnd_data)
+{
+        struct list_head  *peers = &kptllnd_data->kptl_peers[idx];
+        struct list_head  *ptmp;
+        kptl_peer_t       *peer;
+        unsigned long      flags;
+
+
+        /*PJK_UT_MSG("Bucket=%d\n",idx);*/
+
+ again:
+        /* NB. We expect to have a look at all the peers and not find any
+         * rdmas to time out, so we just use a shared lock while we
+         * take a look... */
+        read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+        list_for_each (ptmp, peers) {
+                peer = list_entry (ptmp, kptl_peer_t, peer_list);
+
+                /* In case we have enough credits to return via a
+                 * NOOP, but there were no non-blocking tx descs
+                 * free to do it last time... */
+                kptllnd_peer_check_sends(peer);
+
+                if (!kptllnd_peer_timedout(peer))
+                        continue;
+
+                kptllnd_peer_addref(peer,"temp"); /* 1 ref for me... */
+
+                read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock,
+                                       flags);
+
+                CERROR("Timed out RDMA with "LPX64"\n",peer->peer_nid);
+
+                kptllnd_peer_cancel(peer);
+                kptllnd_peer_decref(peer,"temp"); /* ...until here */
+
+                /* start again now I've dropped the lock */
+                goto again;
+        }
+
+        read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+}
+
+kptl_peer_t *
+kptllnd_peer_find (
+        kptl_data_t *kptllnd_data,
+        lnet_nid_t nid)
+{
+        kptl_peer_t *peer;
+        unsigned long flags;
+        read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+        peer = kptllnd_peer_find_locked(kptllnd_data,nid);
+        read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+        return peer;
+}
+
+kptl_peer_t *
+kptllnd_peer_find_locked (
+        kptl_data_t *kptllnd_data,
+        lnet_nid_t nid)
+{
+        struct list_head *peer_list = kptllnd_nid2peerlist (kptllnd_data,nid);
+        struct list_head *tmp;
+        kptl_peer_t      *peer;
+
+        PJK_UT_MSG(">>> nid="LPX64"\n",nid);
+
+        list_for_each (tmp, peer_list) {
+
+                peer = list_entry (tmp, kptl_peer_t, peer_list);
+
+                LASSERT(peer->peer_state != PEER_STATE_CANCELED);
+
+                if (peer->peer_nid != nid)
+                        continue;
+
+                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
+                       peer, nid, atomic_read (&peer->peer_refcount));
+
+                kptllnd_peer_addref(peer,"find");
+                PJK_UT_MSG("<<< Peer=%p\n",peer);
+                return peer;
+        }
+
+        PJK_UT_MSG("<<< NOTFOUND\n");
+        return NULL;
+}
+
+kptl_peer_t *
+kptllnd_peer_handle_hello (
+        kptl_data_t *kptllnd_data,
+        lnet_nid_t nid,
+        kptl_msg_t *msg)
+{
+        kptl_peer_t    *peer;
+        kptl_peer_t    *peer_to_cancel = 0;
+        unsigned long   flags;
+        kptl_tx_t      *tx_hello = 0;
+        int             rc;
+        __u64           safe_matchbits_from_peer;
+        __u64           safe_matchbits_to_peer = 0;
+
+
+        PJK_UT_MSG(">>>\n");
+
+        safe_matchbits_from_peer = msg->ptlm_u.hello.kptlhm_matchbits +
+                        *kptllnd_tunables.kptl_peercredits;
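+        /*
+         * NB the peer's HELLO matchbits are advanced by
+         * kptl_peercredits because up to that many requests, each
+         * possibly consuming a matchbits value, may still be in
+         * flight from the previous connection.
+         */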
+
+        /*
+         * Immediate message sizes MUST be equal
+         */
+        if(  msg->ptlm_u.hello.kptlhm_max_immd_size !=
+                *kptllnd_tunables.kptl_max_immd_size){
+                CERROR("IMMD message size MUST be equal for all peers got %d expected %d\n",
+                        msg->ptlm_u.hello.kptlhm_max_immd_size,
+                        *kptllnd_tunables.kptl_max_immd_size);
+
+                return 0;
+        }
+
+        write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+        /*
+         * Look for peer because it could have been previously here
+         */
+        peer = kptllnd_peer_find_locked(kptllnd_data,nid);
+
+        /*
+         * If peer is already here
+         */
+        if(peer != NULL){
+
+                if(peer->peer_incarnation == 0) {
+                        /*
+                         * Update the peer state
+                         */
+                        LASSERT(peer->peer_state == PEER_STATE_WAITING_HELLO);
+                        peer->peer_state = PEER_STATE_ACTIVE;
+
+                        /*
+                         * Update the incarnation
+                         */
+                        peer->peer_incarnation = msg->ptlm_srcstamp;
+
+                        /*
+                         * Save the match bits
+                         */
+                        PJK_UT_MSG_DATA(" **** Updating Matchbits="LPX64" ****\n",
+                                safe_matchbits_from_peer);
+
+                        peer->peer_next_matchbits = safe_matchbits_from_peer;
+                        if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS)
+                                peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+                }
+
+                /*
+                 * If the incarnation has changed then we need to
+                 * resend the hello.
+                 */
+                else if( peer->peer_incarnation != msg->ptlm_srcstamp ) {
+
+                        /*
+                         * Put the match bits into the hello message
+                         */
+                        safe_matchbits_to_peer =
+                                peer->peer_last_matchbits_seen + 1 +
+                                *kptllnd_tunables.kptl_peercredits;
+
+                        /*
+                         * Save this peer to cancel
+                         */
+                        peer_to_cancel = peer;
+                        peer = NULL;
+
+                }else{
+                        CERROR("Receiving HELLO message on already connected peer " LPX64"\n",nid);
+                }
+        }
+
+        if( peer == NULL) {
+
+                /*
+                 * Setup a connect HELLO message.  We ultimately might not
+                 * use it but likely we will.
+                 */
+                tx_hello = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE);
+                if( tx_hello == NULL) {
+                        CERROR("Unable to allocate connect message for "LPX64"\n",nid);
+                        goto failed;
+                }
+
+                kptllnd_init_msg(
+                        tx_hello->tx_msg,
+                        PTLLND_MSG_TYPE_HELLO,
+                        sizeof(kptl_hello_msg_t));
+                /*
+                 * Put the match bits into the hello message
+                 */
+                tx_hello->tx_msg->ptlm_u.hello.kptlhm_matchbits =
+                        safe_matchbits_to_peer;
+                tx_hello->tx_msg->ptlm_u.hello.kptlhm_max_immd_size =
+                        *kptllnd_tunables.kptl_max_immd_size;
+
+                rc = kptllnd_peer_create_locked ( kptllnd_data, &peer, nid);
+                if(rc != 0){
+                        CERROR("Failed to create peer (nid="LPX64")\n",nid);
+                        write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+                        peer = NULL;
+                        goto failed;
+                }
+
+                LASSERT(peer->peer_state == PEER_STATE_WAITING_HELLO);
+                peer->peer_state = PEER_STATE_ACTIVE;
+
+                /*
+                 * NB We don't need to hold the peer->peer_lock,
+                 * because we haven't released the kptl_peer_rw_lock,
+                 * which prevents anyone else from getting a pointer
+                 * to this newly created peer
+                 */
+
+                /*
+                 * Update the incarnation
+                 */
+                peer->peer_incarnation = msg->ptlm_srcstamp;
+
+                /*
+                 * Save the match bits
+                 */
+                PJK_UT_MSG_DATA("**** Setting Matchbits="LPX64" ****\n",
+                        safe_matchbits_from_peer);
+                peer->peer_next_matchbits = safe_matchbits_from_peer;
+                if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS)
+                        peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+
+
+                /*
+                 * And record the matchbits carried over from any
+                 * previous incarnation
+                 */
+                peer->peer_last_matchbits_seen = safe_matchbits_to_peer;
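+                /*
+                 * If this HELLO replaced an earlier incarnation of
+                 * the peer, safe_matchbits_to_peer was derived above
+                 * from the highest matchbits we saw from it, so the
+                 * new incarnation starts safely above any RDMA still
+                 * in flight from the old one.
+                 */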
+
+                /*
+                 * Queue the message
+                 */
+                kptllnd_peer_queue_tx_locked(peer,tx_hello);
+
+                /*
+                 * And don't free it because it's queued
+                 */
+                tx_hello = 0;
+
+        }
+
+failed:
+        write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock,flags);
+
+        if(tx_hello)
+                kptllnd_tx_decref(tx_hello);
+
+        /*
+         * Kick the peer's send queue; it may have txs that were
+         * waiting on this HELLO exchange
+         */
+        if(peer){
+                kptllnd_peer_check_sends(peer);
+        }
+
+        if(peer_to_cancel) {
+                kptllnd_peer_cancel(peer_to_cancel);
+                kptllnd_peer_decref(peer_to_cancel,"find");
+        }
+
+        PJK_UT_MSG("<<< Peer=%p\n",peer);
+
+        return peer;
+}
+
+void
+kptllnd_tx_launch (
+        kptl_tx_t *tx,
+        lnet_nid_t target_nid,
+        lnet_msg_t *ptlmsg )
+{
+        kptl_data_t     *kptllnd_data = tx->tx_po.po_kptllnd_data;
+        kptl_peer_t     *peer;
+        unsigned long    flags;
+        rwlock_t        *g_lock = &kptllnd_data->kptl_peer_rw_lock;
+        int              rc;
+        kptl_tx_t       *tx_hello;
+
+        /* If I get here, I've committed to send, so I complete the tx with
+         * failure on any problems */
+
+        PJK_UT_MSG(">>> TX=%p nid="LPX64"\n",tx,target_nid);
+
+        LASSERT (tx->tx_ptlmsg == NULL);
+        tx->tx_ptlmsg = ptlmsg;              /* finalize ptlmsg on completion */
+
+        LASSERT (tx->tx_peer == NULL);       /* only set when assigned a peer */
+
+
+        /*
+         * First try to find the peer (this will grab the
+         * read lock)
+         */
+        peer = kptllnd_peer_find (kptllnd_data,target_nid);
+
+        /*
+         * If we find the peer
+         * then just queue the tx
+         * (which could send it)
+         */
+        if (peer != NULL) {
+                kptllnd_peer_queue_tx ( peer, tx );
+                kptllnd_peer_decref(peer,"find");
+                PJK_UT_MSG("<<< FOUND\n");
+                return;
+        }
+
+
+        /*
+         * Since we didn't find the peer
+         * Setup a HELLO message.  We ultimately might not use it
+         * (in the case that the peer is racing to connect with us)
+         * but more than likely we will.
+         */
+        tx_hello = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE);
+        if( tx_hello == NULL) {
+                CERROR("Unable to allocate connect message for "LPX64"\n",target_nid);
+                kptllnd_tx_decref (tx);
+                return;
+        }
+
+        kptllnd_init_msg(
+                tx_hello->tx_msg,
+                PTLLND_MSG_TYPE_HELLO,
+                sizeof(kptl_hello_msg_t));
+
+
+        /*
+         * Now try again with the exclusive lock
+         * so if it's not found we'll add it
+         */
+        write_lock_irqsave(g_lock, flags);
+
+        peer = kptllnd_peer_find_locked (kptllnd_data,target_nid);
+
+        /*
+         * If we find the peer
+         * then just queue the tx
+         * (which could send it)
+         */
+        if (peer != NULL) {
+                write_unlock_irqrestore(g_lock, flags);
+
+                CDEBUG(D_TRACE,"HELLO message race occurred (nid="LPX64")\n",target_nid);
+
+                kptllnd_peer_queue_tx ( peer, tx );
+                kptllnd_peer_decref(peer,"find");
+
+                /* and we don't need the connection tx*/
+                kptllnd_tx_decref(tx_hello);
+
+                PJK_UT_MSG("<<< FOUND2\n");
+                return;
+        }
+
+        PJK_UT_MSG("TX %p creating NEW PEER nid="LPX64"\n",tx,target_nid);
+        rc = kptllnd_peer_create_locked ( kptllnd_data, &peer, target_nid);
+        if(rc != 0){
+                CERROR("Failed to create peer (nid="LPX64")\n",target_nid);
+                write_unlock_irqrestore(g_lock, flags);
+                kptllnd_tx_decref (tx);
+                kptllnd_tx_decref (tx_hello);
+                return;
+        }
+
+
+        /*
+         * We've never seen this peer before.  So setup
+         * a default message.
+         */
+        tx_hello->tx_msg->ptlm_u.hello.kptlhm_matchbits = 0;
+        tx_hello->tx_msg->ptlm_u.hello.kptlhm_max_immd_size =
+                *kptllnd_tunables.kptl_max_immd_size;
+
+        /*
+         * Queue the connection request
+         * and the actual tx.  We have one credit, so
+         * the connection request will go out, and
+         * the tx will wait for a reply.
+         */
+        PJK_UT_MSG("TXHello=%p\n",tx_hello);
+        kptllnd_peer_queue_tx_locked(peer,tx_hello);
+        kptllnd_peer_queue_tx_locked(peer,tx);
+
+        write_unlock_irqrestore(g_lock,flags);
+
+        kptllnd_peer_check_sends(peer);
+        kptllnd_peer_decref(peer,"find");
+
+        PJK_UT_MSG("<<<\n");
+}
diff --git a/lnet/klnds/ptllnd/ptllnd_rx_buf.c b/lnet/klnds/ptllnd/ptllnd_rx_buf.c
new file mode 100644 (file)
index 0000000..ea68b86
--- /dev/null
@@ -0,0 +1,940 @@
+#include "ptllnd.h"
+
+kptl_rx_t*
+kptllnd_rx_alloc(
+        kptl_data_t *kptllnd_data );
+
+void
+kptllnd_rx_schedule (kptl_rx_t *rx);
+
+void
+kptllnd_rx_buffer_destroy(
+        kptl_rx_buffer_t *rxb);
+int
+kptllnd_rx_buffer_post(
+        kptl_rx_buffer_t *rxb);
+
+void
+kptllnd_rx_buffer_addref(
+        kptl_rx_buffer_t *rxb,
+        const char *owner);
+
+void
+kptllnd_rx_buffer_pool_init(
+        kptl_rx_buffer_pool_t *rxbp)
+{
+        PJK_UT_MSG("kptllnd_rx_buffer_pool_init\n");
+        memset(rxbp,0,sizeof(*rxbp));
+
+        spin_lock_init (&rxbp->rxbp_lock);
+        INIT_LIST_HEAD (&rxbp->rxbp_list);
+
+}
+
+void
+kptllnd_rx_buffer_pool_fini(
+        kptl_rx_buffer_pool_t *rxbp)
+{
+        kptl_rx_buffer_t       *rxb;
+        int                     rc;
+        int                     i;
+
+        PJK_UT_MSG("kptllnd_rx_buffer_pool_fini\n");
+
+        spin_lock(&rxbp->rxbp_lock);
+
+        /*
+         * Set the shutdown flag under the lock
+         */
+        rxbp->rxbp_shutdown = 1;
+
+        i = 2;
+        while(!list_empty(&rxbp->rxbp_list))
+        {
+                struct list_head* iter;
+                int count = 0;
+
+                /*
+                 * Count how many items are on the list right now
+                 */
+                list_for_each(iter,&rxbp->rxbp_list)
+                        ++count;
+
+                CDEBUG(D_TRACE,"|rxbp_list|=%d\n",count);
+
+                /*
+                 * Loop while we still have items on the list,
+                 * or until we've gone through the list once
+                 */
+                while(!list_empty(&rxbp->rxbp_list) && count!=0)
+                {
+                        --count;
+                        rxb = list_entry (rxbp->rxbp_list.next,
+                                                 kptl_rx_buffer_t, rxb_list);
+
+                        LASSERT(rxb->rxb_state == RXB_STATE_POSTED);
+
+
+                        list_del_init(&rxb->rxb_list);
+
+                        /*
+                         * We have hit the one race where the buffer has been
+                         * put on the list, but its MD is not created yet.
+                         */
+                        if(PtlHandleIsEqual(rxb->rxb_mdh,PTL_INVALID_HANDLE)){
+                                list_add_tail(&rxb->rxb_list,&rxbp->rxbp_list);
+                                continue;
+                        }
+
+
+                        /*
+                         * Keep the RXB from being deleted
+                         */
+                        kptllnd_rx_buffer_addref(rxb,"temp");
+
+                        spin_unlock(&rxbp->rxbp_lock);
+
+                        /*
+                         * Unlink the MD
+                         */
+                        LASSERT(atomic_read(&rxb->rxb_refcount)>1);
+                        rc = PtlMDUnlink(rxb->rxb_mdh);
+                        if(rc == 0){
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                                rxb->rxb_mdh = PTL_INVALID_HANDLE;
+                                kptllnd_rx_buffer_decref(rxb,"portals");
+#endif
+                                /*
+                                 * Drop the reference we took above
+                                 */
+                                kptllnd_rx_buffer_decref(rxb,"temp");
+
+                                spin_lock(&rxbp->rxbp_lock);
+                        }else{
+                                PJK_UT_MSG("PtlMDUnlink(%p) rc=%d\n",rxb,rc);
+                                /*
+                                 * The unlink failed, so put this back
+                                 * on the list for later
+                                 */
+                                spin_lock(&rxbp->rxbp_lock);
+
+                                list_add_tail(&rxb->rxb_list,&rxbp->rxbp_list);
+
+                                /*
+                                 * Drop the reference we took above
+                                 */
+                                kptllnd_rx_buffer_decref(rxb,"temp");
+                        }
+                }
+
+                /*
+                 * If there are still items on the list, we
+                 * need to take a break and let the busy RXs
+                 * finish up.
+                 */
+                if(!list_empty(&rxbp->rxbp_list)){
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "Waiting for %d Busy RX Buffers\n",
+                               rxbp->rxbp_count);
+                        spin_unlock(&rxbp->rxbp_lock);
+                        cfs_pause(cfs_time_seconds(1));
+                        spin_lock(&rxbp->rxbp_lock);
+                }
+        }
+
+        CDEBUG(D_TRACE,"|rxbp_list|=EMPTY\n");
+
+        if(rxbp->rxbp_count != 0){
+                PJK_UT_MSG("Waiting for %d RX Buffers to unlink\n",rxbp->rxbp_count);
+
+                i = 2;
+                while (rxbp->rxbp_count != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "Waiting for %d RX Buffers to unlink\n",
+                               rxbp->rxbp_count);
+                        spin_unlock(&rxbp->rxbp_lock);
+                        cfs_pause(cfs_time_seconds(1));
+                        spin_lock(&rxbp->rxbp_lock);
+                }
+        }
+
+        CDEBUG(D_TRACE,"|rxbp_count|=0\n");
+
+        spin_unlock(&rxbp->rxbp_lock);
+}
+
+
+int
+kptllnd_rx_buffer_pool_reserve(
+        kptl_rx_buffer_pool_t *rxbp,
+        kptl_data_t *kptllnd_data,
+        int count)
+{
+        int                     add = 0;
+        int                     i;
+        int                     rc;
+        kptl_rx_buffer_t       *rxb;
+        int                     nbuffers;
+
+        spin_lock(&rxbp->rxbp_lock);
+
+        PJK_UT_MSG("kptllnd_rx_buffer_pool_reserve(%d)\n",count);
+
+        /*
+         * Prevent any more reservations while we are shutting down
+         */
+        if(rxbp->rxbp_shutdown){
+                spin_unlock(&rxbp->rxbp_lock);
+                return -ESHUTDOWN;
+        }
+
+        /*
+         * Make the reservation
+         */
+        rxbp->rxbp_reserved += count;
+
+        /*
+         * Calculate the number of buffers we need;
+         * +1 to handle any rounding error
+         */
+        nbuffers = (rxbp->rxbp_reserved) *
+                (*kptllnd_tunables.kptl_max_immd_size) /
+                (PAGE_SIZE * (*kptllnd_tunables.kptl_rxb_npages));
+        ++nbuffers;
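+        /*
+         * Worked example (illustrative numbers only): with
+         * max_immd_size=4096, rxb_npages=4 and PAGE_SIZE=4096,
+         * each buffer holds 4 immediate messages, so a
+         * reservation of 256 needs 256*4096/16384 = 64
+         * buffers, +1 = 65.
+         */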
+
+        PJK_UT_MSG("nbuffers=%d rxbp_count=%d\n",nbuffers,rxbp->rxbp_count);
+
+        if(rxbp->rxbp_count < nbuffers)
+                add = nbuffers - rxbp->rxbp_count;
+
+        PJK_UT_MSG("adding=%d\n",add);
+
+        /*
+         * Under the same lock, assume they are added;
+         * we'll subtract if we hit an error.
+         */
+        rxbp->rxbp_count += add;
+        spin_unlock(&rxbp->rxbp_lock);
+
+        for(i=0;i<add;i++){
+                PORTAL_ALLOC( rxb,sizeof(*rxb));
+                if(rxb == NULL){
+                        CERROR("Failed to allocate data rxb%d\n",i);
+                        rc = -ENOMEM;
+                        goto failed;
+                }
+
+                memset(rxb,0,sizeof(*rxb));
+
+                kptllnd_posted_object_setup(&rxb->rxb_po,
+                          kptllnd_data,
+                          POSTED_OBJECT_TYPE_RXB);
+
+                rxb->rxb_pool = rxbp;
+                rxb->rxb_state = RXB_STATE_IDLE;
+                rxb->rxb_mdh = PTL_INVALID_HANDLE;
+                INIT_LIST_HEAD (&rxb->rxb_list);
+                INIT_LIST_HEAD (&rxb->rxb_repost_list);
+
+                PORTAL_ALLOC( rxb->rxb_buffer,
+                        PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages);
+                if(rxb->rxb_buffer == NULL) {
+                        CERROR("Failed to allocate data buffer or size %d pages for rx%d\n",
+                                *kptllnd_tunables.kptl_rxb_npages,i);
+                        rc = -ENOMEM;
+                        goto failed;
+                }
+
+                rc = kptllnd_rx_buffer_post(rxb);
+                if(rc != 0)
+                        goto failed;
+        }
+        return 0;
+
+failed:
+        spin_lock(&rxbp->rxbp_lock);
+
+        /*
+         * We really didn't add as many
+         * as we were planning to.
+         */
+        rxbp->rxbp_count -= add - i;
+
+        /*
+         * Cancel this reservation
+         */
+        rxbp->rxbp_reserved -= count;
+        spin_unlock(&rxbp->rxbp_lock);
+
+
+        if(rxb){
+                if(rxb->rxb_buffer)
+                        PORTAL_FREE( rxb->rxb_buffer,PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages);
+                PORTAL_FREE( rxb,sizeof(*rxb));
+        }
+
+        return rc;
+}
+
+void
+kptllnd_rx_buffer_pool_unreserve(
+        kptl_rx_buffer_pool_t *rxbp,
+        int count)
+{
+        spin_lock(&rxbp->rxbp_lock);
+        PJK_UT_MSG("kptllnd_rx_buffer_pool_unreserve(%d)\n",count);
+        rxbp->rxbp_reserved -= count;
+        spin_unlock(&rxbp->rxbp_lock);
+}
+
+void
+kptllnd_rx_buffer_scheduled_post(
+        kptl_rx_buffer_t *rxb)
+{
+        kptl_data_t     *kptllnd_data = rxb->rxb_po.po_kptllnd_data;
+        unsigned long    flags;
+
+        PJK_UT_MSG("rxb=%p\n",rxb);
+
+        spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+        LASSERT(list_empty(&rxb->rxb_repost_list));
+        list_add_tail(&rxb->rxb_repost_list,&kptllnd_data->kptl_sched_rxbq);
+        wake_up(&kptllnd_data->kptl_sched_waitq);
+        spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+}
+
+
+int
+kptllnd_rx_buffer_post(
+        kptl_rx_buffer_t *rxb)
+{
+        int                     rc;
+        ptl_md_t                md;
+        ptl_handle_me_t         meh;
+        ptl_handle_md_t         mdh;
+        ptl_process_id_t        any;
+        kptl_data_t            *kptllnd_data = rxb->rxb_po.po_kptllnd_data;
+        kptl_rx_buffer_pool_t  *rxbp = rxb->rxb_pool;
+
+        any.nid = PTL_NID_ANY;
+        any.pid = PTL_PID_ANY;
+
+        /*PJK_UT_MSG("rxb=%p\n",rxb);*/
+
+        spin_lock(&rxbp->rxbp_lock);
+
+        /*
+         * No new RXBs can enter the POSTED state
+         */
+        if(rxbp->rxbp_shutdown){
+                spin_unlock(&rxbp->rxbp_lock);
+                return -ESHUTDOWN;
+        }
+
+        LASSERT(!in_interrupt());
+
+        LASSERT(rxb->rxb_state == RXB_STATE_IDLE);
+        LASSERT(atomic_read(&rxb->rxb_refcount)==0);
+        LASSERT(PtlHandleIsEqual(rxb->rxb_mdh,PTL_INVALID_HANDLE));
+
+        list_add_tail(&rxb->rxb_list,&rxbp->rxbp_list);
+        atomic_set(&rxb->rxb_refcount,1);
+        rxb->rxb_state = RXB_STATE_POSTED;
+
+        spin_unlock(&rxbp->rxbp_lock);
+
+        /*
+         * Attach the ME
+         */
+        rc = PtlMEAttach(
+            kptllnd_data->kptl_nih,
+            *kptllnd_tunables.kptl_portal,
+            any,
+            LNET_MSG_MATCHBITS,
+            0, /* all matchbits are valid - ignore none*/
+            PTL_UNLINK,
+            PTL_INS_AFTER,
+            &meh);
+        if(rc != 0) {
+                CERROR("PtlMeAttach rxb failed %d\n",rc);
+                goto failure;
+        }
+
+        /*
+         * Setup MD
+         */
+        md.start = rxb->rxb_buffer;
+        md.length = PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages;
+        md.threshold = PTL_MD_THRESH_INF;
+        md.options = PTL_MD_OP_PUT;
+        md.options |= PTL_MD_EVENT_START_DISABLE;
+        md.options |= PTL_MD_MAX_SIZE;
+        md.user_ptr = rxb;
+        md.max_size = *kptllnd_tunables.kptl_max_immd_size;
+        md.eq_handle = kptllnd_data->kptl_eqh;
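+        /*
+         * NB PTL_MD_MAX_SIZE with an infinite threshold should let
+         * this MD accept successive PUTs of up to max_size bytes
+         * each, packed into the buffer at increasing offsets; once
+         * too little space remains the MD is unlinked, and the
+         * UNLINK event triggers a repost (see the callback below).
+         */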
+
+
+        /*
+         * Attach the MD
+         */
+        rc = PtlMDAttach(
+                meh,
+                md,
+                PTL_UNLINK,
+                &mdh);
+        if(rc != 0){
+                int rc2;
+                CERROR("PtlMDAttach rxb failed %d\n",rc);
+                rc2 = PtlMEUnlink(meh);
+                LASSERT(rc2 == 0);
+                goto failure;
+        }
+
+        /*
+         * Assign the MDH under the lock
+         * to deal with the shutdown race of
+         * a partially constructed rxb
+         */
+        spin_lock(&rxbp->rxbp_lock);
+        rxb->rxb_mdh = mdh;
+        spin_unlock(&rxbp->rxbp_lock);
+
+        return 0;
+
+
+failure:
+        /*
+         * Cleanup on error
+         */
+        spin_lock(&rxbp->rxbp_lock);
+        list_del_init(&rxb->rxb_list);
+        atomic_set(&rxb->rxb_refcount,0);
+        rxb->rxb_state = RXB_STATE_IDLE;
+        spin_unlock(&rxbp->rxbp_lock);
+
+        return rc;
+}
+
+void
+kptllnd_rx_buffer_post_handle_error(
+        kptl_rx_buffer_t *rxb)
+{
+        int rc;
+        rc = kptllnd_rx_buffer_post(rxb);
+        if(rc!=0){
+                /* Don't log on shutdown */
+                if(rc != -ESHUTDOWN)
+                        CERROR("Failing to Repost buffer rc=%d\n",rc);
+
+                kptllnd_rx_buffer_destroy(rxb);
+                /* Should I destroy the peer?  I don't think so.
+                 * But this does mean there is now some chance,
+                 * under very heavy load, that we will drop a packet.
+                 * On the other hand, if there are more buffers in
+                 * the pool than are reserved, this won't happen.
+                 * And secondly, under heavy load it is likely that
+                 * a new peer will be added; the reservations for
+                 * the buffers that were lost will get new backing
+                 * buffers at that time.
+                 *
+                 * So things are starting to get bad, but in all
+                 * likelihood they will be fine, and they might
+                 * even correct themselves in time.
+                 */
+        }
+}
+
+void
+kptllnd_rx_buffer_destroy(
+        kptl_rx_buffer_t *rxb)
+{
+        kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool;
+
+        LASSERT(atomic_read(&rxb->rxb_refcount) == 0);
+        LASSERT(rxb->rxb_state == RXB_STATE_IDLE);
+        LASSERT(PtlHandleIsEqual(rxb->rxb_mdh,PTL_INVALID_HANDLE));
+
+        spin_lock(&rxbp->rxbp_lock);
+        list_del(&rxb->rxb_list);
+        rxbp->rxbp_count--;
+        spin_unlock(&rxbp->rxbp_lock);
+
+        PORTAL_FREE( rxb->rxb_buffer,PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages);
+        PORTAL_FREE(rxb,sizeof(*rxb));
+}
+
+
+
+void
+kptllnd_rx_buffer_callback(ptl_event_t *ev)
+{
+        kptl_rx_buffer_t *rxb = ev->md.user_ptr;
+        kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool;
+        /*kptl_data_t  *kptllnd_data = rxb->rxb_po.po_kptllnd_data;*/
+        kptl_rx_t *rx;
+        int nob;
+        int unlinked;
+
+        /*
+         * Set the local unlinked flag
+         */
+        unlinked = ev->type == PTL_EVENT_UNLINK;
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+        if( ev->unlinked )
+                unlinked = 1;
+#endif
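+        /*
+         * NB unlink may be signalled either by a distinct
+         * PTL_EVENT_UNLINK event type or, under
+         * LUSTRE_PORTALS_UNLINK_SEMANTICS, by ev->unlinked being
+         * set on the final event; handle both.
+         */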
+
+        if(!rxbp->rxbp_shutdown){
+                PJK_UT_MSG("RXB Callback %s(%d) rxb=%p nid="LPX64" unlink=%d\n",
+                        get_ev_type_string(ev->type),ev->type,
+                        rxb,ev->initiator.nid,unlinked);
+        }
+
+        LASSERT( ev->md.start == rxb->rxb_buffer);
+        LASSERT( ev->offset + ev->mlength <= PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages);
+        LASSERT( ev->type == PTL_EVENT_PUT_END || ev->type == PTL_EVENT_UNLINK);
+        LASSERT( ev->match_bits == LNET_MSG_MATCHBITS);
+
+        CDEBUG((ev->ni_fail_type == PTL_OK) ? D_NET : D_ERROR,
+               "event type %d, status %d from "LPX64"\n",
+               ev->type, ev->ni_fail_type,ev->initiator.nid);
+
+        nob = ev->mlength;
+
+        if(unlinked){
+                spin_lock(&rxbp->rxbp_lock);
+
+                /*
+                 * Remove this from the list
+                 */
+                list_del_init(&rxb->rxb_list);
+
+                LASSERT(rxb->rxb_state == RXB_STATE_POSTED);
+                rxb->rxb_state = RXB_STATE_IDLE;
+                rxb->rxb_mdh = PTL_INVALID_HANDLE;
+
+                if( rxbp->rxbp_shutdown){
+                        spin_unlock(&rxbp->rxbp_lock);
+                        kptllnd_rx_buffer_decref(rxb,"portals");
+                        return;
+                }
+
+                spin_unlock(&rxbp->rxbp_lock);
+
+
+        }
+
+        /*
+         * Handle failure by just dropping the packet
+         */
+        if(ev->ni_fail_type != PTL_NI_OK){
+                CERROR("Message Dropped: ev status %d\n",ev->ni_fail_type);
+                if(unlinked)
+                        kptllnd_rx_buffer_scheduled_post(rxb);
+                return;
+        }
+
+        /*
+         * Allocate an RX
+         */
+        rx = kptllnd_rx_alloc(rxb->rxb_po.po_kptllnd_data);
+        if(rx == 0){
+                CERROR("Message Dropped: Memory allocation failure");
+                if(unlinked)
+                        kptllnd_rx_buffer_scheduled_post(rxb);
+                return;
+        }
+
+        PJK_UT_MSG_DATA("New RX=%p\n",rx);
+
+        /*
+         * If we are unlinked we can just transfer the ref
+         * that portals owned to the ref that this RX owns
+         * otherwise we need to add a ref specifically for this RX
+         */
+        if(!unlinked)
+                kptllnd_rx_buffer_addref(rxb,"rx");
+
+        rx->rx_msg = rxb->rxb_buffer + ev->offset;
+        rx->rx_rxb = rxb;
+        rx->rx_nob = nob;
+#ifdef TESTING_WITH_LOOPBACK
+        /*
+         * When testing with loopback on socknal
+         * packets are received on loopback NAL so
+         * until I figure out how to do that properly
+         * just make it look like it came from this NID
+         */
+        rx->rx_initiator = rxb->rxb_po.po_kptllnd_data->kptl_portals_id;
+#else
+        rx->rx_initiator = ev->initiator;
+#endif
+
+        kptllnd_rx_schedule(rx);
+
+        if(!rxbp->rxbp_shutdown){
+                PJK_UT_MSG("<<< rx=%p rxb=%p\n",rx,rxb);
+        }
+}
+
+
+void
+kptllnd_rx_schedule (kptl_rx_t *rx)
+{
+        unsigned long    flags;
+        kptl_data_t  *kptllnd_data = rx->rx_rxb->rxb_po.po_kptllnd_data;
+
+        CDEBUG(D_NET, "rx\n");
+
+        PJK_UT_MSG("RX Schedule %p\n",rx);
+
+        spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+        list_add_tail(&rx->rx_list,&kptllnd_data->kptl_sched_rxq);
+        wake_up(&kptllnd_data->kptl_sched_waitq);
+        spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+}
+
+
+void
+kptllnd_rx_scheduler_handler(kptl_rx_t *rx)
+{
+        int                     rc;
+        kptl_rx_buffer_t       *rxb = rx->rx_rxb;
+        kptl_msg_t             *msg = rx->rx_msg;
+        kptl_data_t            *kptllnd_data = rxb->rxb_po.po_kptllnd_data;
+        kptl_peer_t            *peer = NULL;
+        int                     returned_credits = 0;
+        int                     type = msg->ptlm_type;
+        lnet_nid_t              lnet_initiator_nid = ptl2lnetnid(kptllnd_data,rx->rx_initiator.nid);
+
+
+        PJK_UT_MSG_DATA(">>> RXRXRXRXRXRXRXRXRXRXRXRX\n");
+        PJK_UT_MSG_DATA("rx=%p nob=%d\n",rx,rx->rx_nob);
+
+        /*
+         * If the nob==0 then silently discard this message
+         */
+        if(rx->rx_nob == 0)
+                goto exit;
+
+        rc = kptllnd_msg_unpack(msg, rx->rx_nob, kptllnd_data);
+        if (rc != 0) {
+                CERROR ("Error %d unpacking rx from "LPX64"\n",
+                        rc, rx->rx_initiator.nid);
+                goto exit;
+        }
+
+        PJK_UT_MSG_DATA("RX=%p Type=%s(%d)\n",rx,
+                get_msg_type_string(type),type);
+        PJK_UT_MSG_DATA("Msg NOB = %d\n",msg->ptlm_nob);
+        PJK_UT_MSG_DATA("Returned Credits=%d\n",msg->ptlm_credits);
+        PJK_UT_MSG_DATA("Seq # ="LPX64"\n",msg->ptlm_seq);
+        PJK_UT_MSG_DATA("lnet RX nid=" LPX64 "\n",lnet_initiator_nid);
+        PJK_UT_MSG("ptl  RX nid=" LPX64 "\n",rx->rx_initiator.nid);
+
+        if(type == PTLLND_MSG_TYPE_HELLO)
+        {
+                peer = kptllnd_peer_handle_hello(
+                        kptllnd_data,
+                        lnet_initiator_nid,
+                        msg);
+                if( peer == NULL){
+                        CERROR ("Failed to create peer for "LPX64"\n",
+                                lnet_initiator_nid);
+                        goto exit;
+                }
+
+                if (!( msg->ptlm_dststamp == kptllnd_data->kptl_incarnation ||
+                       msg->ptlm_dststamp == 0)) {
+                        CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n",
+                                peer->peer_nid,
+                                msg->ptlm_dststamp,
+                                kptllnd_data->kptl_incarnation );
+                        goto exit;
+                }
+        }
+        else
+        {
+                peer = kptllnd_peer_find(kptllnd_data,lnet_initiator_nid);
+                if( peer == NULL){
+                        CERROR ("No connection with "LPX64"\n",
+                                lnet_initiator_nid);
+                        goto exit;
+                }
+
+                if (msg->ptlm_dststamp != kptllnd_data->kptl_incarnation) {
+                        CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n",
+                                peer->peer_nid,
+                                msg->ptlm_dststamp,
+                                kptllnd_data->kptl_incarnation );
+                        goto exit;
+                }
+        }
+
+        if( msg->ptlm_srcnid != peer->peer_nid){
+                CERROR ("Stale rx srcnid "LPX64" expected "LPX64"\n",
+                        msg->ptlm_srcnid,
+                        peer->peer_nid );
+                goto exit;
+        }
+        if( msg->ptlm_srcstamp != peer->peer_incarnation){
+                CERROR ("Stale rx from "LPX64" srcstamp"LPX64" expected "LPX64"\n",
+                        peer->peer_nid,
+                        msg->ptlm_srcstamp,
+                        peer->peer_incarnation );
+                goto exit;
+        }
+        if( msg->ptlm_dstnid != kptllnd_data->kptl_ni->ni_nid){
+                CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n",
+                        peer->peer_nid,
+                        msg->ptlm_dstnid,
+                        kptllnd_data->kptl_ni->ni_nid );
+                goto exit;
+        }
+
+        /*
+         * Save the number of credits
+         */
+        returned_credits = msg->ptlm_credits;
+
+        /*
+         * Attach the peer to the RX;
+         * it is now responsible for releasing the reference
+         */
+        rx->rx_peer = peer;
+        peer = 0;
+
+        /*
+         * Note: we explicitly ignore the sequence #.
+         * It is informational only.
+         */
+        switch (msg->ptlm_type) {
+        default:
+                CERROR("Bad PTL message type %x from "LPX64"\n",
+                       msg->ptlm_type, rx->rx_peer->peer_nid);
+                break;
+
+        case PTLLND_MSG_TYPE_HELLO:
+                PJK_UT_MSG("PTLLND_MSG_TYPE_HELLO\n");
+                break;
+
+        case PTLLND_MSG_TYPE_NOOP:
+                PJK_UT_MSG("PTLLND_MSG_TYPE_NOOP\n");
+                break;
+
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                PJK_UT_MSG("PTLLND_MSG_TYPE_IMMEDIATE\n");
+                rc = lnet_parse(kptllnd_data->kptl_ni,
+                        &msg->ptlm_u.immediate.kptlim_hdr,
+                        msg->ptlm_srcnid,
+                        rx);
+                /* RX Completing asynchronously */
+                if( rc >= 0)
+                        rx = 0;
+                break;
+
+        case PLTLND_MSG_TYPE_PUT:
+        case PTLLND_MSG_TYPE_GET:
+                PJK_UT_MSG("PTLLND_MSG_TYPE_%s\n",
+                        msg->ptlm_type == PLTLND_MSG_TYPE_PUT ?
+                        "PUT" : "GET");
+
+                /*
+                 * Save the last match bits used
+                 */
+                spin_lock(&rx->rx_peer->peer_lock);
+                if(msg->ptlm_u.req.kptlrm_matchbits > rx->rx_peer->peer_last_matchbits_seen)
+                        rx->rx_peer->peer_last_matchbits_seen = msg->ptlm_u.req.kptlrm_matchbits;
+                spin_unlock(&rx->rx_peer->peer_lock);
+
+                rc = lnet_parse(kptllnd_data->kptl_ni,
+                        &msg->ptlm_u.req.kptlrm_hdr,
+                        msg->ptlm_srcnid,
+                        rx);
+
+                /* RX Completing asynchronously */
+                if( rc >= 0)
+                        rx = 0;
+                break;
+         }
+
+
+        /* NB peer was handed off to the rx above, so use the
+         * initiator's nid here */
+        CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n",
+                type, returned_credits, lnet_initiator_nid);
+
+exit:
+        /* PEER == NULL if it was not yet assigned or has already
+         * been attached to the RX */
+        if(peer)
+                kptllnd_peer_decref(peer,"lookup");
+
+        /* RX == NULL if it is completing asynchronously */
+        if(rx)
+                kptllnd_rx_decref(rx,"sched",kptllnd_data);
+
+        PJK_UT_MSG_DATA("<<< RXRXRXRXRXRXRXRXRXRXRXRX rx=%p\n",rx);
+        return;
+}
+
+void
+kptllnd_rx_buffer_addref(
+        kptl_rx_buffer_t *rxb,
+        const char *owner)
+{
+        atomic_inc(&rxb->rxb_refcount);
+
+#if 0
+        /*
+         * The below message could actually be out of sync
+         * with the real ref count, and is for informational purposes
+         * only
+         */
+        PJK_UT_MSG("rxb=%p owner=%s count=%d\n",rxb,owner,
+                atomic_read(&rxb->rxb_refcount));
+#endif
+}
+
+void
+kptllnd_rx_buffer_decref(
+        kptl_rx_buffer_t *rxb,
+        const char *owner)
+{
+        if( !atomic_dec_and_test (&rxb->rxb_refcount)){
+
+#if 0
+                /*
+                 * The below message could actually be out of sync
+                 * with the real ref count, and is for informational purposes
+                 * only
+                 */
+                PJK_UT_MSG("rxb=%p owner=%s count=%d\n",rxb,owner,
+                        atomic_read(&rxb->rxb_refcount));
+#endif
+                return;
+        }
+
+#if 0
+        PJK_UT_MSG("rxb=%p owner=%s LAST REF reposting\n",rxb,owner);
+#endif
+
+        kptllnd_rx_buffer_post_handle_error(rxb);
+}
+
+kptl_rx_t*
+kptllnd_rx_alloc(
+        kptl_data_t *kptllnd_data )
+{
+        kptl_rx_t* rx;
+
+        if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_RX_ALLOC )){
+                PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_RX_ALLOC SIMULATION triggered\n");
+                CERROR ("FAIL_BLOCKING_RX_ALLOC SIMULATION triggered\n");
+                STAT_UPDATE(kps_rx_allocation_failed);
+                return 0;
+        }
+
+        rx = cfs_mem_cache_alloc ( kptllnd_data->kptl_rx_cache , GFP_ATOMIC);
+        if(rx == 0 ){
+                CERROR("Failed to allocate rx\n");
+                STAT_UPDATE(kps_rx_allocation_failed);
+
+        }else{
+
+                STAT_UPDATE(kps_rx_allocated);
+
+                memset(rx,0,sizeof(*rx));
+
+                CFS_INIT_LIST_HEAD(&rx->rx_list);
+                atomic_set(&rx->rx_refcount,1);
+        }
+
+        return rx;
+}
+
+void
+kptllnd_rx_destroy(kptl_rx_t *rx,kptl_data_t *kptllnd_data)
+{
+        kptl_peer_t  *peer = rx->rx_peer;
+        kptl_msg_t   *msg = rx->rx_msg;
+        int returned_credits = msg->ptlm_credits;
+        
+        PJK_UT_MSG(">>> rx=%p\n",rx);
+
+        STAT_UPDATE(kps_rx_released);
+
+        LASSERT(atomic_read(&rx->rx_refcount)==0);
+
+        if(rx->rx_rxb){
+                PJK_UT_MSG("Release rxb=%p\n",rx->rx_rxb);
+                kptllnd_rx_buffer_decref(rx->rx_rxb,"rx");
+                rx->rx_rxb = 0;
+        }else{
+                PJK_UT_MSG("rxb already released\n");
+        }
+
+        if(peer){
+
+                /*
+                 * Update credits (only after I've reposted the
+                 * buffer; returning a credit tells the peer it
+                 * may send into our buffer pool again)
+                 */
+                spin_lock(&peer->peer_lock);
+                peer->peer_credits += returned_credits;
+                LASSERT( peer->peer_credits <=
+                        *kptllnd_tunables.kptl_peercredits);
+                peer->peer_outstanding_credits++;
+                LASSERT( peer->peer_outstanding_credits <=
+                        *kptllnd_tunables.kptl_peercredits);
+                spin_unlock(&peer->peer_lock);
+
+                PJK_UT_MSG_DATA("Giving Back %d credits rx=%p\n",returned_credits,rx);
+
+                /* Have I received credits that will let me send? */
+                if (returned_credits != 0)
+                        kptllnd_peer_check_sends(peer);
+
+                kptllnd_peer_decref(peer,"lookup");
+        }
+
+        cfs_mem_cache_free(kptllnd_data->kptl_rx_cache,rx);
+
+        PJK_UT_MSG("<<< rx=%p\n",rx);
+}
+
+void
+kptllnd_rx_addref(kptl_rx_t *rx,const char *owner)
+{
+        atomic_inc(&rx->rx_refcount);
+
+        /*
+         * The below message could actually be out of sync
+         * with the real ref count, and is for informational purposes
+         * only
+         */
+        PJK_UT_MSG("rx=%p owner=%s count=%d\n",rx,owner,
+                atomic_read(&rx->rx_refcount));
+}
+
+void
+kptllnd_rx_decref(kptl_rx_t *rx,const char *owner,kptl_data_t *kptllnd_data)
+{
+        if( !atomic_dec_and_test (&rx->rx_refcount)){
+                /*
+                 * The below message could actually be out of sync
+                 * with the real ref count, and is for informational purposes
+                 * only
+                 */
+                PJK_UT_MSG("rx=%p owner=%s count=%d\n",rx,owner,
+                        atomic_read(&rx->rx_refcount));
+                return;
+        }
+
+        PJK_UT_MSG("rx=%p owner=%s LAST REF destroying\n",rx,owner);
+
+        kptllnd_rx_destroy(rx,kptllnd_data);
+}
+
diff --git a/lnet/klnds/ptllnd/ptllnd_tx.c b/lnet/klnds/ptllnd/ptllnd_tx.c
new file mode 100644 (file)
index 0000000..e69fdec
--- /dev/null
@@ -0,0 +1,516 @@
+#include "ptllnd.h"
+
+
+void
+kptllnd_tx_schedule (kptl_tx_t *tx);
+
+
+
+int
+kptllnd_setup_tx_descs (kptl_data_t *kptllnd_data)
+{
+        kptl_tx_t       *tx;
+        int             i;
+
+        PJK_UT_MSG("\n");
+
+        /*
+         * First initialize the tx descriptors
+         */
+        memset(kptllnd_data->kptl_tx_descs, 0,
+               PTLLND_TX_MSGS() * sizeof(kptl_tx_t));
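+
+        /*
+         * PTLLND_TX_MSGS() is assumed to cover both the normal pool (the
+         * first kptl_ntx descriptors) and the reserved non-blocking pool
+         * that follows it; see the tx_isnblk assignment below.
+         */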
+
+        for (i = 0; i < PTLLND_TX_MSGS(); i++) {
+                tx = &kptllnd_data->kptl_tx_descs[i];
+
+
+                kptllnd_posted_object_setup(&tx->tx_po,
+                                          kptllnd_data,
+                                          POSTED_OBJECT_TYPE_TX);
+
+                CFS_INIT_LIST_HEAD(&tx->tx_list);
+                CFS_INIT_LIST_HEAD(&tx->tx_schedlist);
+
+                /*
+                 * Determine if this is a regular or reserved descriptor
+                 */
+                tx->tx_isnblk = (i >= *kptllnd_tunables.kptl_ntx);
+
+                /*
+                 * Set the state
+                 */
+                tx->tx_state = TX_STATE_ON_IDLE_QUEUE;
+
+                PORTAL_ALLOC( tx->tx_msg, *kptllnd_tunables.kptl_max_immd_size );
+                if(tx->tx_msg == NULL){
+                        CERROR("Failed to allocate TX payload\n");
+                        kptllnd_cleanup_tx_descs(kptllnd_data);
+                        /* without this the caller would see success and
+                         * queue a descriptor with no payload buffer */
+                        return -ENOMEM;
+                }
+
+
+                /*
+                 * Add this to the correct queue
+                 */
+                if (tx->tx_isnblk)
+                        list_add (&tx->tx_list,
+                                  &kptllnd_data->kptl_idle_nblk_txs);
+                else
+                        list_add (&tx->tx_list,
+                                  &kptllnd_data->kptl_idle_txs);
+        }
+
+        return (0);
+}
+
+void
+kptllnd_cleanup_tx_descs(kptl_data_t *kptllnd_data)
+{
+        kptl_tx_t       *tx;
+        int             i;
+
+        PJK_UT_MSG("\n");
+
+        for (i = 0; i < PTLLND_TX_MSGS(); i++) {
+                tx = &kptllnd_data->kptl_tx_descs[i];
+
+
+                /*
+                 * Handle partial initialization by stopping
+                 * when we hit one that is not fully initialized
+                 */
+                if( tx->tx_msg == NULL )
+                        break;
+
+                LASSERT( tx->tx_state == TX_STATE_ON_IDLE_QUEUE );
+
+                PORTAL_FREE(tx->tx_msg,*kptllnd_tunables.kptl_max_immd_size);
+        }
+}
+
+kptl_tx_t *
+kptllnd_get_idle_tx(
+        kptl_data_t *kptllnd_data,
+        int may_block,
+        kptl_tx_type_t purpose)
+{
+        kptl_tx_t      *tx = NULL;
+        ENTRY;
+
+        PJK_UT_MSG(">>> may_block=%d purpose=%d\n",may_block,purpose);
+
+        if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX_PUT_ALLOC ) && purpose == TX_TYPE_LARGE_PUT){
+                PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX_PUT_ALLOC SIMULATION triggered\n");
+                CERROR ("FAIL_BLOCKING_TX_PUT_ALLOC SIMULATION triggered\n");
+                tx = NULL;
+                STAT_UPDATE(kpt_tx_allocation_failed);
+                goto exit;
+        }
+        if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX_GET_ALLOC ) && purpose == TX_TYPE_LARGE_GET){
+                PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX_GET_ALLOC SIMULATION triggered\n");
+                CERROR ("FAIL_BLOCKING_TX_GET_ALLOC SIMULATION triggered\n");
+                tx = NULL;
+                STAT_UPDATE(kpt_tx_allocation_failed);
+                goto exit;
+        }
+        if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX )){
+                PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX SIMULATION triggered\n");
+                CERROR ("FAIL_BLOCKING_TX SIMULATION triggered\n");
+                tx = NULL;
+                STAT_UPDATE(kpt_tx_allocation_failed);
+                goto exit;
+        }
+
+        spin_lock(&kptllnd_data->kptl_tx_lock);
+
+        /*
+         * The lock is held whenever the loop condition is tested, so it
+         * is still held on every exit path (including shutdown), matching
+         * the single unlock below.
+         */
+        while ( !kptllnd_data->kptl_shutdown ) {
+
+                /* "normal" descriptor is free */
+                if (!list_empty (&kptllnd_data->kptl_idle_txs)) {
+                        tx = list_entry (kptllnd_data->kptl_idle_txs.next,
+                                         kptl_tx_t, tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        /* may dip into reserve pool */
+                        if (list_empty (&kptllnd_data->kptl_idle_nblk_txs)) {
+                                CERROR ("reserved tx desc pool exhausted\n");
+                                break;
+                        }
+
+                        tx = list_entry (kptllnd_data->kptl_idle_nblk_txs.next,
+                                         kptl_tx_t, tx_list);
+                        break;
+                }
+
+                /* block for idle tx; drop the lock while sleeping */
+                spin_unlock(&kptllnd_data->kptl_tx_lock);
+
+                wait_event (kptllnd_data->kptl_idle_tx_waitq,
+                            !list_empty (&kptllnd_data->kptl_idle_txs) ||
+                            kptllnd_data->kptl_shutdown);
+
+                spin_lock(&kptllnd_data->kptl_tx_lock);
+        }
+
+        if (tx != NULL) {
+
+                /*
+                 * Check the state
+                 */
+                LASSERT(tx->tx_state == TX_STATE_ON_IDLE_QUEUE);
+
+                /*
+                 * Reference is now owned by caller
+                 */
+                LASSERT(atomic_read(&tx->tx_refcount)== 0);
+                atomic_set(&tx->tx_refcount,1);
+
+                /*
+                 * Remove it from the idle queue
+                 */
+                list_del_init (&tx->tx_list);
+
+                /*
+                 * Set the state and type
+                 */
+                tx->tx_state = TX_STATE_ALLOCATED;
+                tx->tx_type = purpose;
+
+                /*
+                 * Initialize the TX descriptor so that cleanup can be
+                 * handled easily even with a partially initialized descriptor
+                 */
+                tx->tx_mdh = PTL_INVALID_HANDLE;
+                tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+                tx->tx_ptlmsg = 0;
+                tx->tx_ptlmsg_reply = 0;
+                tx->tx_peer = 0;
+                tx->tx_associated_rx = 0;
+
+                /*
+                 * These must be re-initialized
+                 */
+                tx->tx_status = -EINVAL;
+                tx->tx_seen_send_end    = 0;
+                tx->tx_seen_reply_end   = 0;
+                tx->tx_payload_niov     = 0;
+                tx->tx_payload_iov      = 0;
+                tx->tx_payload_kiov     = 0;
+                tx->tx_payload_offset   = 0;
+                tx->tx_payload_nob      = 0;
+
+                STAT_UPDATE(kps_tx_allocated);
+        }else{
+                STAT_UPDATE(kpt_tx_allocation_failed);
+        }
+
+        spin_unlock(&kptllnd_data->kptl_tx_lock);
+
+exit:
+        PJK_UT_MSG("<<< tx=%p\n",tx);
+
+        RETURN(tx);
+}
+
+void
+kptllnd_tx_done (kptl_tx_t *tx)
+{
+        kptl_data_t *kptllnd_data = tx->tx_po.po_kptllnd_data;
+        LASSERT (!in_interrupt());
+
+        PJK_UT_MSG(">>> tx=%p\n",tx);
+
+        LASSERT(tx->tx_state != TX_STATE_ON_IDLE_QUEUE);
+        LASSERT(PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE));
+        LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+        LASSERT(atomic_read(&tx->tx_refcount) == 0);
+        LASSERT(list_empty(&tx->tx_schedlist)); /* not on any scheduler list */
+
+        if(tx->tx_ptlmsg != 0){
+                PJK_UT_MSG("tx=%p finalize\n",tx);
+                lnet_finalize (kptllnd_data->kptl_ni, tx->tx_ptlmsg, tx->tx_status);
+                tx->tx_ptlmsg = 0;
+        }
+        if(tx->tx_ptlmsg_reply != 0){
+                PJK_UT_MSG("tx=%p finalize reply\n",tx);
+                lnet_finalize (kptllnd_data->kptl_ni, tx->tx_ptlmsg_reply, tx->tx_status);
+                tx->tx_ptlmsg_reply = 0;
+        }
+
+        /*
+         * Release the associated RX if there is one
+         */
+        if(tx->tx_associated_rx){
+                PJK_UT_MSG("tx=%p destroy associated rx %p\n",tx,tx->tx_associated_rx);
+                kptllnd_rx_decref(tx->tx_associated_rx,"tx",kptllnd_data);
+                tx->tx_associated_rx =0;
+        }
+
+        /*
+         * Cleanup resources associate with the peer
+         */
+        if(tx->tx_peer){
+                PJK_UT_MSG("tx=%p detach from peer=%p\n",tx,tx->tx_peer);
+                kptllnd_peer_dequeue_tx(tx->tx_peer,tx);
+                kptllnd_peer_decref(tx->tx_peer,"tx");
+                tx->tx_peer = NULL;
+        }
+
+        LASSERT(list_empty(&tx->tx_list)); /* removed from any peer list*/
+
+        /*
+         * state = back on idle queue
+         */
+        tx->tx_state = TX_STATE_ON_IDLE_QUEUE;
+
+        /*
+         * Put this tx descriptor back on the correct idle queue
+         * If this is a "normal" descriptor then somebody might
+         * be waiting so wake them up
+         */
+        spin_lock(&kptllnd_data->kptl_tx_lock);
+
+        if (tx->tx_isnblk) {
+                list_add (&tx->tx_list, &kptllnd_data->kptl_idle_nblk_txs);
+        } else {
+                list_add (&tx->tx_list, &kptllnd_data->kptl_idle_txs);
+                wake_up (&kptllnd_data->kptl_idle_tx_waitq);
+        }
+
+        STAT_UPDATE(kps_tx_released);
+
+        spin_unlock(&kptllnd_data->kptl_tx_lock);
+
+        PJK_UT_MSG("<<< tx=%p\n",tx);
+}
+
+void
+kptllnd_tx_schedule (kptl_tx_t *tx)
+{
+        kptl_data_t *kptllnd_data = tx->tx_po.po_kptllnd_data;
+        unsigned long    flags;
+
+        PJK_UT_MSG("tx=%p\n",tx);
+
+        spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+        LASSERT(list_empty(&tx->tx_schedlist));
+        list_add_tail(&tx->tx_schedlist,&kptllnd_data->kptl_sched_txq);
+        wake_up(&kptllnd_data->kptl_sched_waitq);
+        spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+}
+
+void
+kptllnd_tx_callback(ptl_event_t *ev)
+{
+        kptl_tx_t *tx = ev->md.user_ptr;
+        kptl_peer_t *peer;
+        int rc;
+        int do_decref = 0;
+
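+        /*
+         * A tx can have up to two MDs in flight: tx_mdh_msg for the small
+         * message itself and tx_mdh for the bulk buffer of a large PUT/GET.
+         * Each outstanding MD presumably holds its own reference on the tx;
+         * the cases below drop a reference as each MD's events complete.
+         */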
+        PJK_UT_MSG(">>> %s(%d) tx=%p fail=%d\n",
+                get_ev_type_string(ev->type),ev->type,tx,ev->ni_fail_type);
+
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+        PJK_UT_MSG("ev->unlinked=%d\n",ev->unlinked);
+#endif
+
+        if(ev->type == PTL_EVENT_UNLINK ){
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                /*
+                 * Ignore unlink events if we don't
+                 * have Lustre semantics, as these only occur
+                 * in one-to-one correspondence with OPXXX_END
+                 * events, and we've already cleaned up in
+                 * those cases.
+                 */
+                PJK_UT_MSG("<<<\n");
+                return;
+#else
+                /*
+                 * Clear the handles
+                 */
+                if(PtlHandleIsEqual(ev->md_handle,tx->tx_mdh))
+                        tx->tx_mdh = PTL_INVALID_HANDLE;
+                else if (PtlHandleIsEqual(ev->md_handle,tx->tx_mdh_msg))
+                        tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+
+                tx->tx_status = -EINVAL;
+                kptllnd_tx_scheduled_decref(tx);
+                PJK_UT_MSG("<<<\n");
+                return;
+#endif
+        }
+
+        LASSERT(tx->tx_peer != NULL);
+        peer = tx->tx_peer;
+
+        spin_lock(&peer->peer_lock);
+
+        /*
+         * Save the status flag
+         */
+        tx->tx_status = ev->ni_fail_type == PTL_NI_OK ? 0 : -EINVAL;
+
+        switch(ev->type)
+        {
+        case PTL_EVENT_SEND_END:
+
+                /*
+                 * Mark that we've seen an SEND END
+                 */
+                tx->tx_seen_send_end = 1;
+
+                switch(tx->tx_type)
+                {
+                default:
+                        LBUG();
+                        break;
+
+                case TX_TYPE_SMALL_MESSAGE:
+                        PJK_UT_MSG("TX_TYPE_SMALL_MESSAGE\n");
+                        LASSERT(PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE));
+
+                        /*
+                         * Success or failure we are done with the Message MD
+                         */
+                        tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+                        do_decref = 1;
+                        break;
+
+                case TX_TYPE_LARGE_PUT:
+                case TX_TYPE_LARGE_GET:
+                        PJK_UT_MSG("TX_TYPE_LARGE_%s\n",
+                                tx->tx_type == TX_TYPE_LARGE_PUT ?
+                                "PUT" : "GET");
+                        /*
+                         * Success or failure we are done with the Message MD
+                         */
+                        tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+
+                        /*
+                         * On error we're not going to make any more
+                         * progress, so the PUT_END or GET_END event
+                         * is never going to come.
+                         */
+                        if(ev->ni_fail_type != PTL_NI_OK ){
+
+                                /*
+                                 * There was an error in the message,
+                                 * so we can safely unlink the MD
+                                 *
+                                 */
+                                if(!PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)){
+                                        LASSERT(atomic_read(&tx->tx_refcount)>1);
+                                        rc = PtlMDUnlink(tx->tx_mdh);
+                                        LASSERT(rc == 0);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                                        tx->tx_mdh = PTL_INVALID_HANDLE;
+                                        /*
+                                         * We are holding another reference
+                                         * so this is not going to do anything
+                                         * but decrement the tx->ref_count
+                                         */
+                                        kptllnd_tx_decref(tx);
+#endif
+                                }
+                        }
+
+                        do_decref = 1;
+                        break;
+
+                case TX_TYPE_LARGE_PUT_RESPONSE:
+                        PJK_UT_MSG("TX_TYPE_LARGE_PUT_RESPONSE\n");
+                        LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+
+                        /*
+                         * If we've already seen the reply end,
+                         * or if this is a failure and we're NEVER going
+                         * to see the reply end, release our reference here
+                         */
+                        if(tx->tx_seen_reply_end || ev->ni_fail_type != PTL_NI_OK){
+                                tx->tx_mdh = PTL_INVALID_HANDLE;
+                                do_decref = 1;
+                        }
+                        break;
+
+                case TX_TYPE_LARGE_GET_RESPONSE:
+                        PJK_UT_MSG("TX_TYPE_LARGE_GET_RESPONSE\n");
+                        LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+
+                        /*
+                         * Success or failure we are done with the MD
+                         */
+                        tx->tx_mdh = PTL_INVALID_HANDLE;
+                        do_decref = 1;
+                        break;
+                }
+                break;
+
+        case PTL_EVENT_GET_END:
+                LASSERT(tx->tx_type == TX_TYPE_LARGE_PUT);
+                LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+                tx->tx_mdh = PTL_INVALID_HANDLE;
+                do_decref = 1;
+                break;
+        case PTL_EVENT_PUT_END:
+                LASSERT(tx->tx_type == TX_TYPE_LARGE_GET);
+                LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+                tx->tx_mdh = PTL_INVALID_HANDLE;
+                do_decref = 1;
+                break;
+        case PTL_EVENT_REPLY_END:
+                LASSERT(tx->tx_type == TX_TYPE_LARGE_PUT_RESPONSE);
+                LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+                tx->tx_seen_reply_end = 1;
+                if(tx->tx_seen_send_end){
+                        tx->tx_mdh = PTL_INVALID_HANDLE;
+                        do_decref = 1;
+                }
+                break;
+        default:
+                LBUG();
+        }
+
+        spin_unlock(&peer->peer_lock);
+
+        if(do_decref)
+                kptllnd_tx_scheduled_decref(tx);
+        PJK_UT_MSG("<<< decref=%d\n",do_decref);
+}
+
+void
+kptllnd_tx_addref(
+        kptl_tx_t *tx)
+{
+        atomic_inc(&tx->tx_refcount);
+}
+
+void
+kptllnd_tx_decref(
+        kptl_tx_t *tx)
+{
+        if( !atomic_dec_and_test(&tx->tx_refcount)){
+                return;
+        }
+
+        PJK_UT_MSG("tx=%p LAST REF\n",tx);
+        kptllnd_tx_done(tx);
+}
+
+void
+kptllnd_tx_scheduled_decref(
+        kptl_tx_t *tx)
+{
+        if( !atomic_dec_and_test(&tx->tx_refcount)){
+                /*
+                 * The below message could actually be out of sync
+                 * with the real ref count, and is for informational purposes
+                 * only
+                 */
+                PJK_UT_MSG("tx=%p count=%d\n",tx,
+                        atomic_read(&tx->tx_refcount));
+                return;
+        }
+
+        PJK_UT_MSG("tx=%p LAST REF\n",tx);
+        kptllnd_tx_schedule(tx);
+}
diff --git a/lnet/klnds/ptllnd/ptllnd_wire.h b/lnet/klnds/ptllnd/ptllnd_wire.h
new file mode 100644 (file)
index 0000000..3b0df56
--- /dev/null
@@ -0,0 +1,56 @@
+/************************************************************************
+ * Portals LND wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+typedef struct
+{
+        lnet_hdr_t        kptlim_hdr;             /* portals header */
+        char              kptlim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kptl_immediate_msg_t;
+
+typedef struct
+{
+        lnet_hdr_t        kptlrm_hdr;             /* portals header */
+        __u64             kptlrm_matchbits;       /* matchbits */
+} WIRE_ATTR kptl_request_msg_t;
+
+typedef struct
+{
+        __u64             kptlhm_matchbits;       /* matchbits */
+        __u32             kptlhm_max_immd_size;   /* immd message size */
+} WIRE_ATTR kptl_hello_msg_t;
+
+typedef struct kptl_msg
+{
+        /* First 2 fields fixed FOR ALL TIME */
+        __u32           ptlm_magic;     /* I'm a Portals LND message */
+        __u16           ptlm_version;   /* this is my version number */
+        __u8            ptlm_type;      /* the message type */
+        __u8            ptlm_credits;   /* returned credits */
+        __u32           ptlm_nob;       /* # bytes in whole message */
+        __u32           ptlm_cksum;     /* checksum (0 == no checksum) */
+        __u64           ptlm_srcnid;    /* sender's NID */
+        __u64           ptlm_srcstamp;  /* sender's incarnation */
+        __u64           ptlm_dstnid;    /* destination's NID */
+        __u64           ptlm_dststamp;  /* destination's incarnation */
+        __u64           ptlm_seq;       /* sequence number */
+
+         union {
+                kptl_immediate_msg_t    immediate;
+                kptl_request_msg_t      req;
+                kptl_hello_msg_t        hello;
+        } WIRE_ATTR ptlm_u;
+
+} kptl_msg_t;
+
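+/*
+ * The magic/version pair at the head of every message lets a receiver
+ * detect a byte-flipped peer (the magic reads back byte-swapped, since
+ * messages travel in sender's byte order) and reject an incompatible
+ * version before interpreting the rest of the header.
+ */
+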
+#define PTLLND_MSG_MAGIC                0x50746C4E  /* 'PtlN' unique magic */
+#define PTLLND_MSG_VERSION              0x01
+
+#define PTLLND_MSG_TYPE_INVALID         0x00
+#define PTLLND_MSG_TYPE_PUT             0x01
+#define PTLLND_MSG_TYPE_GET             0x02
+#define PTLLND_MSG_TYPE_IMMEDIATE       0x03    /* No bulk data xfer*/
+#define PTLLND_MSG_TYPE_NOOP            0x04
+#define PTLLND_MSG_TYPE_HELLO           0x05
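+
+/* Note: PTLLND_MSG_TYPE_NOOP presumably carries no payload and exists
+ * purely to return credits when no other outbound traffic is queued */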
+
diff --git a/lnet/klnds/ptllnd/wirecheck.c b/lnet/klnds/ptllnd/wirecheck.c
new file mode 100644 (file)
index 0000000..e40a043
--- /dev/null
@@ -0,0 +1,183 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <portals/api-support.h>
+
+/* This ghastly hack allows me to include lib-types.h.  It doesn't affect any
+ * assertions generated here (but fails safe if it ever does) */
+typedef struct {
+        int     counter;
+} atomic_t;
+
+#include <portals/lib-types.h>
+
+#include "ptlnal_wire.h"
+
+#ifndef HAVE_STRNLEN
+#define strnlen(s, i) strlen(s)
+#endif
+
+#define BLANK_LINE()                            \
+do {                                            \
+        printf ("\n");                          \
+} while (0)
+
+#define COMMENT(c)                              \
+do {                                            \
+        printf ("        /* "c" */\n");         \
+} while (0)
+
+#undef STRINGIFY
+#define STRINGIFY(a) #a
+
+#define CHECK_DEFINE(a)                                         \
+do {                                                            \
+        printf ("        CLASSERT ("#a" == "STRINGIFY(a)");\n"); \
+} while (0)
+
+#define CHECK_VALUE(a)                                  \
+do {                                                    \
+        printf ("        CLASSERT ("#a" == %d);\n", a);  \
+} while (0)
+
+#define CHECK_MEMBER_OFFSET(s,m)                \
+do {                                            \
+        CHECK_VALUE((int)offsetof(s, m));       \
+} while (0)
+
+#define CHECK_MEMBER_SIZEOF(s,m)                \
+do {                                            \
+        CHECK_VALUE((int)sizeof(((s *)0)->m));  \
+} while (0)
+
+#define CHECK_MEMBER(s,m)                       \
+do {                                            \
+        CHECK_MEMBER_OFFSET(s, m);              \
+        CHECK_MEMBER_SIZEOF(s, m);              \
+} while (0)
+
+#define CHECK_STRUCT(s)                         \
+do {                                            \
+        BLANK_LINE ();                          \
+        COMMENT ("Checks for struct "#s);       \
+        CHECK_VALUE((int)sizeof(s));            \
+} while (0)
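+
+/*
+ * The program below emits a kptllnd_assert_wire_constants() function made
+ * of CLASSERT()s (compile-time assertions); building the generated code in
+ * the LND verifies that the compiler/arch in use lays out the wire structs
+ * exactly as they were laid out where this generator ran.
+ */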
+
+void
+system_string (char *cmdline, char *str, int len)
+{
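+        /*
+         * Run cmdline under the shell and capture the first line of its
+         * stdout into str (at most len bytes, NUL terminated).
+         */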
+        int   fds[2];
+        int   rc;
+        pid_t pid;
+
+        rc = pipe (fds);
+        if (rc != 0)
+                abort ();
+
+        pid = fork ();
+        if (pid == 0) {
+                /* child */
+                int   fd = fileno(stdout);
+
+                rc = dup2(fds[1], fd);
+                if (rc != fd)
+                        abort();
+
+                exit(system(cmdline));
+                /* notreached */
+        } else if ((int)pid < 0) {
+                abort();
+        } else {
+                FILE *f = fdopen (fds[0], "r");
+
+                if (f == NULL)
+                        abort();
+
+                close(fds[1]);
+
+                if (fgets(str, len, f) == NULL)
+                        abort();
+
+                if (waitpid(pid, &rc, 0) != pid)
+                        abort();
+
+                if (!WIFEXITED(rc) ||
+                    WEXITSTATUS(rc) != 0)
+                        abort();
+
+                if (strnlen(str, len) == len)
+                        str[len - 1] = 0;
+
+                if (str[strlen(str) - 1] == '\n')
+                        str[strlen(str) - 1] = 0;
+
+                fclose(f);
+        }
+}
+
+int
+main (int argc, char **argv)
+{
+        char unameinfo[80];
+        char gccinfo[80];
+
+        system_string("uname -a", unameinfo, sizeof(unameinfo));
+        system_string("gcc -v 2>&1 | tail -1", gccinfo, sizeof(gccinfo));
+
+        printf ("void vibnal_assert_wire_constants (void)\n"
+                "{\n"
+                "        /* Wire protocol assertions generated by 'wirecheck'\n"
+                "         * running on %s\n"
+                "         * with %s */\n"
+                "\n", unameinfo, gccinfo);
+
+        BLANK_LINE ();
+
+        COMMENT ("Constants...");
+        CHECK_DEFINE (PTLLND_MSG_MAGIC);
+        CHECK_DEFINE (PTLLND_MSG_VERSION);
+
+        CHECK_DEFINE (PTLLND_MSG_TYPE_INVALID);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_PUT);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_GET);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_IMMEDIATE);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_NOOP);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_HELLO);
+
+        CHECK_STRUCT (kptl_msg_t);
+        CHECK_MEMBER (kptl_msg_t, ptlm_magic);
+        CHECK_MEMBER (kptl_msg_t, ptlm_version);
+        CHECK_MEMBER (kptl_msg_t, ptlm_type);
+        CHECK_MEMBER (kptl_msg_t, ptlm_credits);
+        CHECK_MEMBER (kptl_msg_t, ptlm_nob);
+        CHECK_MEMBER (kptl_msg_t, ptlm_cksum);
+        CHECK_MEMBER (kptl_msg_t, ptlm_srcnid);
+        CHECK_MEMBER (kptl_msg_t, ptlm_srcstamp);
+        CHECK_MEMBER (kptl_msg_t, ptlm_dstnid);
+        CHECK_MEMBER (kptl_msg_t, ptlm_dststamp);
+        CHECK_MEMBER (kptl_msg_t, ptlm_seq);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.immediate);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.req);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.hello);
+
+        CHECK_STRUCT (kptl_immediate_msg_t);
+        CHECK_MEMBER (kptl_immediate_msg_t, kptlim_hdr);
+        CHECK_MEMBER (kptl_immediate_msg_t, kptlim_payload[13]);
+
+        CHECK_STRUCT (kptl_request_msg_t);
+        CHECK_MEMBER (kptl_request_msg_t, kptlrm_hdr);
+        CHECK_MEMBER (kptl_request_msg_t, kptlrm_matchbits);
+
+        CHECK_STRUCT (kptl_hello_msg_t);
+        CHECK_MEMBER (kptl_hello_msg_t, kptlhm_matchbits);
+        CHECK_MEMBER (kptl_hello_msg_t, kptlhm_max_immd_size);
+
+        printf ("}\n\n");
+
+        return (0);
+}
index 1f0c9fd..e12f75c 100644 (file)
@@ -137,6 +137,11 @@ static struct netstrfns  libcfs_netstrfns[] = {
          .nf_modname  = "kgmlnd", 
          .nf_addr2str = libcfs_num_addr2str, 
          .nf_str2addr = libcfs_num_str2addr},
+        {.nf_type     = PTLLND,
+         .nf_name     = "ptl",
+         .nf_modname  = "kptllnd",
+         .nf_addr2str = libcfs_num_addr2str,
+         .nf_str2addr = libcfs_num_str2addr},
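+        /* numeric addresses of the form <number>@ptl for the Portals LND */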
         /* placeholder for net0 alias.  It MUST BE THE LAST ENTRY */
         {.nf_type     = -1},
 };
index c6dc4b4..96ccb34 100644 (file)
@@ -25,6 +25,7 @@ const char *get_ev_type_string(int evtype)
 }
 
 static volatile int seen = 0;
+static volatile int seen_unlink = 0;
 
 static inline void handler(lnet_event_t *ev)
 {
@@ -40,4 +41,5 @@ static inline void handler(lnet_event_t *ev)
         PJK_UT_MSG("md.user_ptr=%p\n",ev->md.user_ptr);
         PJK_UT_MSG("-------- EVENT END --------------\n");
         ++seen;
+        if (ev->unlinked) ++seen_unlink;
 }
index 9655985..33088c7 100644 (file)
@@ -1,8 +1,8 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
- #define UT_MSG_MODULE_NAME "utcli " 
+
+ #define UT_MSG_MODULE_NAME "utcli "
  #include "ut.h"
 
 int pkt_size = 300;
@@ -46,7 +46,7 @@ static int __init utcli_init(void)
         PORTAL_ALLOC (buffer, pkt_size);
         if (buffer == NULL)
         {
-                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", pkt_size);
+                CERROR ("Unable to allocate out_buf (%d bytes)\n", pkt_size);
                 return -ENOMEM;
         }
 
@@ -100,9 +100,9 @@ static int __init utcli_init(void)
 
         target.pid = 0;
         target.nid = libcfs_str2nid(nid);
-                
+
         PJK_UT_MSG("target.nid="LPX64"\n",target.nid);
-        
+
         for(i=0;i<1;i++)
         {
                 if(get){
@@ -141,7 +141,7 @@ static int __init utcli_init(void)
         while(i++ < 10 && seen == 0)
                 cfs_pause(cfs_time_seconds(1));
         if(seen == 0)
-                PJK_UT_MSG("------------------TIMEDOUT--------------------\n");                        
+                PJK_UT_MSG("------------------TIMEDOUT--------------------\n");
         else{
                 int good;
                 if(get){
@@ -153,12 +153,12 @@ static int __init utcli_init(void)
                 }else{
                         good = 1;
                 }
-                
+
                 if(good)
                         PJK_UT_MSG("------------------COMPLETE--------------------\n");
                 else
-                        PJK_UT_MSG("------------------TIMEDOUT--------------------\n");                                
-        }                
+                        PJK_UT_MSG("------------------TIMEDOUT--------------------\n");
+        }
 
 
 
@@ -172,6 +172,15 @@ static int __init utcli_init(void)
 exit5:
         PJK_UT_MSG("LNetMDUnlink()\n");
         LNetMDUnlink(mdh);
+
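+        /*
+         * With Lustre unlink semantics the UNLINK event can be delivered
+         * after LNetMDUnlink() returns; wait (up to 120s) for it so the EQ
+         * is not freed while the event is still in flight.
+         */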
+        if(!seen_unlink){
+                PJK_UT_MSG("------------Waiting for UNLINK ------------\n");
+                i=0;
+                while(i++ < 120 && seen_unlink == 0)
+                        cfs_pause(cfs_time_seconds(1));
+        }
+
+        cfs_pause(cfs_time_seconds(1));
 exit4:
         PJK_UT_MSG("LNetEQFree()\n");
         LNetEQFree(eqh);
@@ -189,7 +198,7 @@ exit0:
 static void /*__exit*/ utcli_cleanup(void)
 {
         PJK_UT_MSG(">>>\n");
-        PJK_UT_MSG("<<<\n");        
+        PJK_UT_MSG("<<<\n");
 } /* utcli_cleanup() */
 
 
index 2341681..73a3c7b 100644 (file)
@@ -3,7 +3,7 @@
  */
 
 
-#define UT_MSG_MODULE_NAME "utsrv " 
+#define UT_MSG_MODULE_NAME "utsrv "
 #include "ut.h"
 
 
@@ -23,32 +23,32 @@ static int __init utsrv_init(void)
         lnet_process_id_t       anypid;
         lnet_process_id_t       mypid;
         lnet_md_t               md;
-        
+
         PJK_UT_MSG(">>>\n");
-        PJK_UT_MSG("pkt_size=%d\n",pkt_size);        
+        PJK_UT_MSG("pkt_size=%d\n",pkt_size);
         PJK_UT_MSG("auto_unlink=%d\n",auto_unlink);
-        
+
         PJK_UT_MSG("PORTAL_ALLOC\n");
         PORTAL_ALLOC (buffer, pkt_size);
         if (buffer == NULL)
         {
-                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", pkt_size);
+                CERROR ("Unable to allocate out_buf (%d bytes)\n", pkt_size);
                 rc = -ENOMEM;
                 goto exit0;
-        }        
-  
+        }
+
         PJK_UT_MSG("LNetNiInit()\n");
         rc = LNetNIInit(0);
         if (rc < 0)
         {
                 CERROR ("LNetNIInit: error %d\n", rc);
                 goto exit1;
-        }        
-        
+        }
+
         LNetGetId(0,&mypid);
         PJK_UT_MSG("my.nid="LPX64"\n",mypid.nid);
-        PJK_UT_MSG("my.pid=0x%x\n",mypid.pid);        
-        
+        PJK_UT_MSG("my.pid=0x%x\n",mypid.pid);
+
         PJK_UT_MSG("LNetEQAlloc\n");
         rc = LNetEQAlloc(
                 64,      /* max number of events; why 64? */
@@ -57,8 +57,8 @@ static int __init utsrv_init(void)
         if(rc != 0) {
                 CERROR("LNetEQAlloc failed %d\n",rc);
                 goto exit2;
-        }        
-        
+        }
+
         anypid.nid = LNET_NID_ANY;
         anypid.pid = LNET_PID_ANY;
 
@@ -75,8 +75,8 @@ static int __init utsrv_init(void)
         if(rc != 0) {
                 CERROR("LNetMeAttach failed %d\n",rc);
                 goto exit3;
-        }        
-        
+        }
+
         md.start = buffer;
         md.length = pkt_size;
         md.threshold = auto_unlink ? 1 : 100;
@@ -97,26 +97,26 @@ static int __init utsrv_init(void)
         if(rc != 0){
                 CERROR("LNetMDAttach failed %d\n",rc);
                 goto exit4;
-        }        
-        
+        }
+
         rc = 0;
         goto exit0;
-        
-exit4:        
+
+exit4:
         PJK_UT_MSG("LNetMEUnlink()\n");
         LNetMEUnlink(meh);
-exit3:        
+exit3:
         PJK_UT_MSG("LNetEQFree()\n");
         LNetEQFree(eqh);
 exit2:
         PJK_UT_MSG("LNetNiFini()\n");
         LNetNIFini();
 exit1:
-        PORTAL_FREE(buffer,pkt_size);             
+        PORTAL_FREE(buffer,pkt_size);
 exit0:
         PJK_UT_MSG("<<< rc=%d\n",rc);
         return rc;
-        
+
 } /* utsrv_init() */
 
 
@@ -126,12 +126,12 @@ static void /*__exit*/ utsrv_cleanup(void)
         PJK_UT_MSG("LNetMDUnlink()\n");
         LNetMDUnlink(mdh);
         PJK_UT_MSG("LNetMEUnlink()\n");
-        LNetMEUnlink(meh);        
+        LNetMEUnlink(meh);
         PJK_UT_MSG("LNetEQFree()\n");
-        LNetEQFree(eqh);        
+        LNetEQFree(eqh);
         PJK_UT_MSG("LNetNiFini()\n");
         LNetNIFini();
-        PORTAL_FREE(buffer,pkt_size);             
+        PORTAL_FREE(buffer,pkt_size);
         PJK_UT_MSG("<<<\n");
 } /* utsrv_cleanup() */