MODULES := kptllnd
-kptllnd-objs := ptllnd.o
+EXTRA_POST_CFLAGS := @PTLLNDCPPFLAGS@
-EXTRA_PRE_CFLAGS := @PTLLNDCPPFLAGS@
+kptllnd-objs := ptllnd_rx_buf.o ptllnd_tx.o ptllnd.o ptllnd_cb.o ptllnd_modparams.o ptllnd_peer.o
@INCLUDE_RULES@
--- /dev/null
+1. This version of the Portals LND is intended to work on the Cray XT3 using
+ Cray Portals as a network transport.
+
+2. To enable the building of the Portals LND (ptllnd.ko) configure with the
+ following option:
+ ./configure --with-portals=<path-to-portals-headers>
+
+3. The following configuration options are supported (an illustrative
+   example of setting them follows this list):
+
+ ntx:
+        The total number of message descriptors.
+
+ concurrent_peers:
+        The maximum number of concurrent peers. Peers attempting
+        to connect beyond the maximum will not be allowed.
+
+ peer_hash_table_size:
+ The number of hash table slots for the peers. This number
+ should scale with concurrent_peers.
+
+ cksum:
+ Set to non-zero to enable message (not RDMA) checksums for
+        outgoing packets. Incoming packets will always be checksummed
+        if necessary, independent of this value.
+
+ timeout:
+        The amount of time a request can linger in a peer's active
+        queue before the peer is considered dead. Units: seconds.
+
+ portal:
+ The portal ID to use for the ptllnd traffic.
+
+ rxb_npages:
+        The number of pages in an RX buffer.
+
+ credits:
+ The maximum total number of concurrent sends that are
+ outstanding at any given instant.
+
+ peercredits:
+ The maximum number of concurrent sends that are
+        outstanding to a single peer at any given instant.
+
+ max_immd_size:
+        The maximum immediate message size. This MUST be
+        the same on all nodes in a cluster. A peer connecting
+        with a different max_immd_size will be rejected.
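+
+4. The options above can be set as module parameters at load time. The
+   line below is an illustrative sketch only (it assumes the module is
+   loaded as kptllnd, per the Makefile; the values are examples, not
+   tuned recommendations):
+
+      modprobe kptllnd credits=256 peercredits=8 cksum=1 timeout=50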
endif
MOSTLYCLEANFILES = *.o *.ko *.mod.c
-DIST_SOURCES = $(kptllnd-objs:%.o=%.c) ptllnd.h
+DIST_SOURCES = $(kptllnd-objs:%.o=%.c) ptllnd.h ptllnd_wire.h
#include "ptllnd.h"
-lnet_handle_ni_t nih;\r
-\r
-\r
+
+lnd_t kptllnd_lnd = {
+ .lnd_type = PTLLND,
+ .lnd_startup = kptllnd_startup,
+ .lnd_shutdown = kptllnd_shutdown,
+ .lnd_ctl = kptllnd_ctl,
+ .lnd_send = kptllnd_send,
+ .lnd_recv = kptllnd_recv,
+};
+
+kptl_data_t kptllnd_data;
+kptl_stats_t kptllnd_stats;
+
+void kptllnd_shutdown (lnet_ni_t *ni);
+
+void ptllnd_assert_wire_constants (void)
+{
+ /* TBD - auto generated */
+}
+
+__u32
+kptllnd_cksum (void *ptr, int nob)
+{
+ char *c = ptr;
+ __u32 sum = 0;
+
+ while (nob-- > 0)
+ sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+ /* ensure I don't return 0 (== no checksum) */
+ return (sum == 0) ? 1 : sum;
+}
+
+void
+kptllnd_init_msg(kptl_msg_t *msg, int type, int body_nob)
+{
+ msg->ptlm_type = type;
+ msg->ptlm_nob = offsetof(kptl_msg_t, ptlm_u) + body_nob;
+}
+
+void
+kptllnd_msg_pack(
+ kptl_msg_t *msg,
+ int credits,
+ lnet_nid_t dstnid,
+ __u64 dststamp,
+ __u64 seq,
+ kptl_data_t *kptllnd_data)
+{
+ msg->ptlm_magic = PTLLND_MSG_MAGIC;
+ msg->ptlm_version = PTLLND_MSG_VERSION;
+ /* msg->ptlm_type Filled in kptllnd_init_msg() */
+ msg->ptlm_credits = credits;
+ /* msg->ptlm_nob Filled in kptllnd_init_msg() */
+ msg->ptlm_cksum = 0;
+ msg->ptlm_srcnid = kptllnd_data->kptl_ni->ni_nid;
+ msg->ptlm_srcstamp = kptllnd_data->kptl_incarnation;
+ msg->ptlm_dstnid = dstnid;
+ msg->ptlm_dststamp = dststamp;
+ msg->ptlm_seq = seq;
+
+ if (*kptllnd_tunables.kptl_cksum) {
+ /* NB ptlm_cksum zero while computing cksum */
+ msg->ptlm_cksum = kptllnd_cksum(msg, msg->ptlm_nob);
+ }
+}
+
+int
+kptllnd_msg_unpack(kptl_msg_t *msg, int nob,kptl_data_t *kptllnd_data)
+{
+ const int hdr_size = offsetof(kptl_msg_t, ptlm_u);
+ __u32 msg_cksum;
+ int flip;
+ int msg_nob;
+
+ /* 6 bytes are enough to have received magic + version */
+ if (nob < 6) {
+ CERROR("Very Short message: %d\n", nob);
+ return -EPROTO;
+ }
+
+ /*
+ * Determine if we need to flip
+ */
+ if (msg->ptlm_magic == PTLLND_MSG_MAGIC) {
+ flip = 0;
+ } else if (msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC)) {
+ flip = 1;
+ } else {
+ CERROR("Bad magic: %08x\n", msg->ptlm_magic);
+ return -EPROTO;
+ }
+
+ if (msg->ptlm_version !=
+ (flip ? __swab16(PTLLND_MSG_VERSION) : PTLLND_MSG_VERSION)) {
+ CERROR("Bad version: got %d expected %d\n",
+ msg->ptlm_version,PTLLND_MSG_VERSION);
+ return -EPROTO;
+ }
+
+ if (nob < hdr_size) {
+ CERROR("Short header: got %d, wanted at least %d\n",
+ nob, hdr_size);
+ return -EPROTO;
+ }
+
+ msg_nob = flip ? __swab32(msg->ptlm_nob) : msg->ptlm_nob;
+ if (nob != msg_nob) {
+ CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+ return -EPROTO;
+ }
+
+ /* checksum must be computed with
+ * 1) ptlm_cksum zero and
+ * 2) BEFORE anything gets modified/flipped
+ */
+ msg_cksum = flip ? __swab32(msg->ptlm_cksum) : msg->ptlm_cksum;
+ msg->ptlm_cksum = 0;
+ if (msg_cksum != 0){
+ STAT_UPDATE(kps_incoming_checksums_calculated);
+ if( msg_cksum != kptllnd_cksum(msg, msg_nob) ) {
+ STAT_UPDATE(kps_incoming_checksums_invalid);
+ CERROR("Bad checksum\n");
+ return -EPROTO;
+ }
+ }
+
+ /* Restore the checksum */
+ msg->ptlm_cksum = msg_cksum;
+
+ if(flip){
+ /* leave magic unflipped as a clue to peer endianness */
+ __swab16s(&msg->ptlm_version);
+                /* These two are 1 byte long so we don't swap them,
+                 * but check this assumption */
+ CLASSERT (sizeof(msg->ptlm_type) == 1);
+ CLASSERT (sizeof(msg->ptlm_credits) == 1);
+ msg->ptlm_nob = msg_nob;
+ __swab64s(&msg->ptlm_srcnid);
+ __swab64s(&msg->ptlm_srcstamp);
+ __swab64s(&msg->ptlm_dstnid);
+ __swab64s(&msg->ptlm_dststamp);
+ __swab64s(&msg->ptlm_seq);
+
+ switch(msg->ptlm_type)
+ {
+ case PLTLND_MSG_TYPE_PUT:
+ case PTLLND_MSG_TYPE_GET:
+ __swab64s(&msg->ptlm_u.req.kptlrm_matchbits);
+ break;
+ case PTLLND_MSG_TYPE_IMMEDIATE:
+ case PTLLND_MSG_TYPE_NOOP:
+ /* Do nothing */
+ break;
+ case PTLLND_MSG_TYPE_HELLO:
+ __swab64s(&msg->ptlm_u.hello.kptlhm_matchbits);
+ __swab32s(&msg->ptlm_u.hello.kptlhm_max_immd_size);
+ break;
+ default:
+ CERROR("Bad message type: %d\n", msg->ptlm_type);
+ return -EPROTO;
+ }
+ }
+
+ /*
+ * Src nid can not be ANY
+ */
+ if (msg->ptlm_srcnid == PTL_NID_ANY) {
+ CERROR("Bad src nid: "LPX64"\n", msg->ptlm_srcnid);
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+
+
+int
+kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+ struct portal_ioctl_data *data = arg;
+ int rc = -EINVAL;
+ kptl_data_t *kptllnd_data = ni->ni_data;
+
+ PJK_UT_MSG(">>> kptllnd_ctl cmd=%u arg=%p\n",cmd,arg);
+
+ /*
+ * Validate that the context block is actually
+ * pointing to this interface
+ */
+ LASSERT (ni == kptllnd_data->kptl_ni);
+
+ switch(cmd) {
+ case IOC_PORTAL_DEL_PEER: {
+ rc = kptllnd_peer_del (kptllnd_data,data->ioc_nid);
+ break;
+ }
+ /*
+ * Not Supported - This is Legacy stuff
+ case IOC_PORTAL_GET_PEER:
+ case IOC_PORTAL_ADD_PEER:
+ case IOC_PORTAL_GET_CONN:
+ case IOC_PORTAL_CLOSE_CONNECTION:
+ case IOC_PORTAL_REGISTER_MYNID:
+ */
+ default:
+ CERROR("Unsupported IOCTL command %d\n",cmd);
+ rc=-EINVAL;
+ break;
+ }
+ PJK_UT_MSG("<<< kptllnd_ctl rc=%d\n",rc);
+ return rc;
+}
+
+void
+kptllnd_posted_object_setup(
+ kptl_posted_object_t* posted_obj,
+ kptl_data_t *kptllnd_data,
+ int type)
+{
+ /*
+ * Setup back pointer to LND instance data
+ */
+ posted_obj->po_kptllnd_data = kptllnd_data;
+
+ /*
+ * Setup descriptor type
+ */
+ posted_obj->po_flags.pof_type = type;
+}
+
+int
+kptllnd_startup (lnet_ni_t *ni)
+{
+ int rc;
+ int i;
+ struct timeval tv;
+ kptl_data_t *kptllnd_data;
+ ptl_err_t ptl_rc;
+
+
+ PJK_UT_MSG(">>>\n");
+
+ LASSERT (ni->ni_lnd == &kptllnd_lnd);
+
+ PORTAL_ALLOC (kptllnd_data,sizeof(*kptllnd_data));
+ if (kptllnd_data == NULL){
+ CERROR ("Failed to allocate memory for PTLLND context\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * zero pointers, flags etc
+ * put everything into a known state.
+ */
+ memset (kptllnd_data, 0, sizeof (*kptllnd_data));
+ kptllnd_data->kptl_eqh = PTL_INVALID_HANDLE;
+ kptllnd_data->kptl_nih = PTL_INVALID_HANDLE;
+
+ /*
+ * Uptick the module reference count
+ */
+ PORTAL_MODULE_USE;
+
+ /*
+ * Setup pointers between the ni and context data block
+ */
+ kptllnd_data->kptl_ni = ni;
+ ni->ni_data = kptllnd_data;
+
+ /*
+ * Setup Credits
+ */
+ ni->ni_maxtxcredits = *kptllnd_tunables.kptl_credits;
+ ni->ni_peertxcredits = *kptllnd_tunables.kptl_peercredits;
+
+
+ /*
+ * Initialize the Network interface instance
+ * We use the default because we don't have any
+ * way to choose a better interface.
+ * Requested and actual limits are ignored.
+ */
+ ptl_rc = PtlNIInit(PTL_IFACE_DEFAULT, 0, NULL, NULL,
+ &kptllnd_data->kptl_nih);
+
+ /*
+ * Note: PTL_IFACE_DUP simply means that the requested
+         * interface was already initialized and that we're sharing it,
+         * which is OK.
+ */
+ if (ptl_rc != PTL_OK && ptl_rc != PTL_IFACE_DUP){
+ CERROR ("PtlNIInit: error %d\n", ptl_rc);
+ rc = -EINVAL;
+ goto failed;
+ }
+
+ ptl_rc = PtlEQAlloc(
+ kptllnd_data->kptl_nih,
+ 8, /* We use callback - no need for max */
+ kptllnd_eq_callback, /* handler callback */
+ &kptllnd_data->kptl_eqh); /* output handle */
+ if(ptl_rc != 0) {
+ CERROR("PtlEQAlloc failed %d\n",ptl_rc);
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ /*
+ * Fetch the lower NID
+ */
+        ptl_rc = PtlGetId(kptllnd_data->kptl_nih, &kptllnd_data->kptl_portals_id);
+        if (ptl_rc != PTL_OK) {
+                CERROR ("PtlGetId: error %d\n", ptl_rc);
+ rc = -EINVAL;
+ goto failed;
+ }
+
+ PJK_UT_MSG("lnet nid=" LPX64 " (passed in)\n",ni->ni_nid);
+
+ /*
+         * Create the new NID, based on the LND network type
+         * and the lower NI's address data.
+ */
+ ni->ni_nid = ptl2lnetnid(kptllnd_data,kptllnd_data->kptl_portals_id.nid);
+
+ PJK_UT_MSG("ptl nid=" LPX64 "\n",kptllnd_data->kptl_portals_id.nid);
+ PJK_UT_MSG("lnet nid=" LPX64 " (passed back)\n",ni->ni_nid);
+
+ CDEBUG(D_INFO,"ptl nid=" LPX64 "\n",kptllnd_data->kptl_portals_id.nid);
+ CDEBUG(D_INFO,"lnet nid=" LPX64 "\n",ni->ni_nid);
+
+ /*
+         * Initialize the incarnation
+ */
+ do_gettimeofday(&tv);
+ kptllnd_data->kptl_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+ PJK_UT_MSG("Incarnation=" LPX64 "\n",kptllnd_data->kptl_incarnation);
+ CDEBUG(D_INFO,"Incarnation=" LPX64 "\n",kptllnd_data->kptl_incarnation);
+
+ /*
+ * Setup the sched locks/lists/waitq
+ */
+ spin_lock_init (&kptllnd_data->kptl_sched_lock);
+ init_waitqueue_head (&kptllnd_data->kptl_sched_waitq);
+ INIT_LIST_HEAD (&kptllnd_data->kptl_sched_txq);
+ INIT_LIST_HEAD (&kptllnd_data->kptl_sched_rxq);
+ INIT_LIST_HEAD (&kptllnd_data->kptl_sched_rxbq);
+
+ /*
+ * Setup the tx locks/lists/waitq
+ */
+ spin_lock_init (&kptllnd_data->kptl_tx_lock);
+ INIT_LIST_HEAD (&kptllnd_data->kptl_idle_txs);
+ INIT_LIST_HEAD (&kptllnd_data->kptl_idle_nblk_txs);
+ init_waitqueue_head(&kptllnd_data->kptl_idle_tx_waitq);
+
+ /*
+ * Allocate and setup the peer hash table
+ */
+ PJK_UT_MSG("Allocate Peer Hash Table\n");
+ rwlock_init(&kptllnd_data->kptl_peer_rw_lock);
+ kptllnd_data->kptl_peer_hash_size = *kptllnd_tunables.kptl_peer_hash_table_size;
+ INIT_LIST_HEAD(&kptllnd_data->kptl_canceled_peers);
+ PORTAL_ALLOC (kptllnd_data->kptl_peers,
+ sizeof (struct list_head) * kptllnd_data->kptl_peer_hash_size);
+ if (kptllnd_data->kptl_peers == NULL) {
+ CERROR("Failed to allocate space for peer hash table size=%d\n",
+ kptllnd_data->kptl_peer_hash_size);
+ rc = -ENOMEM;
+ goto failed;
+ }
+ for (i = 0; i < kptllnd_data->kptl_peer_hash_size; i++)
+ INIT_LIST_HEAD(&kptllnd_data->kptl_peers[i]);
+
+ /* lists/ptrs/locks initialised */
+ kptllnd_data->kptl_init = PTLLND_INIT_DATA;
+
+ /*****************************************************/
+
+ /*
+ * Start the scheduler threads for handling incoming
+ * requests. No need to advance the state because
+         * this will be automatically cleaned up now that the
+         * PTLLND_INIT_DATA state has been entered.
+ */
+ PJK_UT_MSG("starting %d scheduler threads\n",PTLLND_N_SCHED);
+ for (i = 0; i < PTLLND_N_SCHED; i++) {
+ rc = kptllnd_thread_start (
+ kptllnd_scheduler,
+ i,
+ kptllnd_data);
+ if (rc != 0) {
+ CERROR("Can't spawn scheduler[%d]: %d\n", i, rc);
+ goto failed;
+ }
+ }
+
+ /*
+ * Allocate space for the tx descriptors
+         * (Note we don't need to advance the init state
+         * because we'll use the pointer being NULL as a sentinel
+         * to know that we have to clean this up.)
+ */
+ PJK_UT_MSG("Allocate TX Descriptor array\n");
+ PORTAL_ALLOC (kptllnd_data->kptl_tx_descs,
+ PTLLND_TX_MSGS() * sizeof(kptl_tx_t));
+ if (kptllnd_data->kptl_tx_descs == NULL){
+ CERROR ("Can't allocate space for TX Descriptor array count=%d\n",
+ PTLLND_TX_MSGS());
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ /*
+ * Now setup the tx descriptors
+ */
+ rc = kptllnd_setup_tx_descs(kptllnd_data);
+ if (rc != 0) {
+ CERROR ("Can\'t setup tx descs: %d\n", rc);
+ goto failed;
+ }
+
+ /* flag TX descs initialised */
+ kptllnd_data->kptl_init = PTLLND_INIT_TXD;
+
+ /*****************************************************/
+
+
+ kptllnd_rx_buffer_pool_init(&kptllnd_data->kptl_rx_buffer_pool);
+
+ /* flag rx descs initialised */
+ kptllnd_data->kptl_init = PTLLND_INIT_RXD;
+
+ /*****************************************************/
+
+
+ kptllnd_data->kptl_rx_cache = cfs_mem_cache_create (
+ "ptllnd_rx",
+ sizeof(kptl_rx_t) + *kptllnd_tunables.kptl_max_immd_size,
+ 0, /* offset */
+ 0, /* flags */
+ NULL,NULL); /* CTOR/DTOR */
+        if (kptllnd_data->kptl_rx_cache == NULL) {
+                CERROR("Can't create slab for RX descriptors\n");
+                rc = -ENOMEM;
+                goto failed;
+        }
+
+ rc = kptllnd_rx_buffer_pool_reserve(
+ &kptllnd_data->kptl_rx_buffer_pool,
+ kptllnd_data,
+ *kptllnd_tunables.kptl_concurrent_peers);
+ if( rc != 0) {
+ CERROR("Can't reserve RX Buffer pool: %d\n",rc);
+ goto failed;
+ }
+
+ /* flag everything initialised */
+ kptllnd_data->kptl_init = PTLLND_INIT_ALL;
+
+
+ /*****************************************************/
+
+ PJK_UT_MSG("<<< kptllnd_startup SUCCESS\n");
+ return 0;
+
+ failed:
+ CDEBUG(D_NET, "kptllnd_startup failed rc=%d\n",rc);
+ kptllnd_shutdown (ni);
+ PJK_UT_MSG("<<< kptllnd_startup rc=%d\n",rc);
+ return rc;
+}
+
+void
+kptllnd_shutdown (lnet_ni_t *ni)
+{
+ int i;
+ kptl_data_t *kptllnd_data = ni->ni_data;
+
+ PJK_UT_MSG(">>> kptllnd_shutdown\n");
+
+ /*
+ * Validate that the context block is actually
+ * pointing to this interface
+ */
+ LASSERT (ni == kptllnd_data->kptl_ni);
+
+ CDEBUG(D_MALLOC, "before LND cleanup: kmem %d\n",
+ atomic_read (&libcfs_kmemory));
+
+ /*
+ * Now depending on where we are in the initialization
+ * cleanup the context block
+ */
+ switch (kptllnd_data->kptl_init) {
+
+ case PTLLND_INIT_ALL:
+ case PTLLND_INIT_RXD:
+ PJK_UT_MSG("PTLLND_INIT_RXD\n");
+
+ kptllnd_rx_buffer_pool_fini(
+ &kptllnd_data->kptl_rx_buffer_pool);
+
+ LASSERT(list_empty(&kptllnd_data->kptl_sched_rxq));
+ LASSERT(list_empty(&kptllnd_data->kptl_sched_rxbq));
+
+ /* fall through */
+ case PTLLND_INIT_TXD:
+ PJK_UT_MSG("PTLLND_INIT_TXD\n");
+
+ /*
+ * If there were peers started up then
+ * clean them up.
+ */
+ if( atomic_read(&kptllnd_data->kptl_npeers) != 0) {
+ PJK_UT_MSG("Deleting %d peers\n",atomic_read(&kptllnd_data->kptl_npeers));
+
+ /* nuke all peers */
+ kptllnd_peer_del(kptllnd_data,PTL_NID_ANY);
+
+ i = 2;
+ while (atomic_read (&kptllnd_data->kptl_npeers) != 0) {
+
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "Waiting for %d peers to terminate\n",
+ atomic_read (&kptllnd_data->kptl_npeers));
+ PJK_UT_MSG("Waiting for %d peers to terminate\n",
+ atomic_read (&kptllnd_data->kptl_npeers));
+ cfs_pause(cfs_time_seconds(1));
+ }
+ }
+
+ LASSERT(list_empty(&kptllnd_data->kptl_canceled_peers));
+ PJK_UT_MSG("All peers deleted\n");
+
+ /*
+ * Set the shutdown flag
+ */
+ kptllnd_data->kptl_shutdown = 1;
+
+ /*
+         * First we shut down the scheduler threads.
+         * It makes cleanup easier not to have to worry about races
+         * with N other threads.
+         *
+         * This is also safe no matter the kptl_init state,
+         * because it is a no-op (kptl_nthreads == 0)
+         * if we are not in the right state.
+ */
+ if(atomic_read (&kptllnd_data->kptl_nthreads) != 0){
+ PJK_UT_MSG("Stopping %d threads\n",atomic_read(&kptllnd_data->kptl_nthreads));
+ /*
+ * Wake up all the schedulers
+ */
+ wake_up_all (&kptllnd_data->kptl_sched_waitq);
+
+ i = 2;
+ while (atomic_read (&kptllnd_data->kptl_nthreads) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "Waiting for %d threads to terminate\n",
+ atomic_read (&kptllnd_data->kptl_nthreads));
+ PJK_UT_MSG("Waiting for %d threads to terminate\n",
+ atomic_read (&kptllnd_data->kptl_nthreads));
+ cfs_pause(cfs_time_seconds(1));
+ }
+
+ }
+ PJK_UT_MSG("All Threads stopped\n");
+
+
+ LASSERT(list_empty(&kptllnd_data->kptl_sched_txq));
+
+ kptllnd_cleanup_tx_descs(kptllnd_data);
+
+ /* fall through */
+ case PTLLND_INIT_DATA:
+
+ PJK_UT_MSG("PTLLND_INIT_DATA\n");
+
+ LASSERT (atomic_read(&kptllnd_data->kptl_npeers) == 0);
+ LASSERT (kptllnd_data->kptl_peers != NULL);
+ for (i = 0; i < kptllnd_data->kptl_peer_hash_size; i++) {
+ LASSERT (list_empty (&kptllnd_data->kptl_peers[i]));
+ }
+ /*
+ * Nothing here now, but libcfs might soon require
+ * us to explicitly destroy wait queues and semaphores
+ * that would be done here
+ */
+
+ /* fall through */
+
+ case PTLLND_INIT_NOTHING:
+ PJK_UT_MSG("PTLLND_INIT_NOTHING\n");
+ break;
+ }
+
+ /*
+ * There are a number of things that can be done
+ * outside the state machine, because the construction
+ * (or lack thereof) can be determined directly from
+ * the pointer or handle itself.
+ * Clean these things up here
+ */
+
+ /*
+ * Cleanup the portals EQ
+ */
+ if(!PtlHandleIsEqual(kptllnd_data->kptl_eqh,PTL_INVALID_HANDLE))
+ PtlEQFree(kptllnd_data->kptl_eqh);
+
+ /*
+ * release the portals ni handle
+ */
+ if(!PtlHandleIsEqual(kptllnd_data->kptl_nih,PTL_INVALID_HANDLE))
+ PtlNIFini(kptllnd_data->kptl_nih);
+
+ /*
+ * Free the tx descriptors
+ */
+ if (kptllnd_data->kptl_tx_descs != NULL)
+ PORTAL_FREE(kptllnd_data->kptl_tx_descs,
+ PTLLND_TX_MSGS() * sizeof(kptl_tx_t));
+
+ /*
+ * Cleanup the RX descriptor slab
+ */
+ if (kptllnd_data->kptl_rx_cache != NULL)
+ cfs_mem_cache_destroy( kptllnd_data->kptl_rx_cache);
+
+ /*
+ * Cleanup the peer hash table
+ */
+ if (kptllnd_data->kptl_peers != NULL){
+ PORTAL_FREE (kptllnd_data->kptl_peers,
+ sizeof (struct list_head) *
+ kptllnd_data->kptl_peer_hash_size);
+ }
+
+ /*
+ * And free the context block
+ */
+ PORTAL_FREE(kptllnd_data,sizeof(*kptllnd_data));
+
+ CDEBUG(D_MALLOC, "after LND cleanup: kmem %d\n",
+ atomic_read (&libcfs_kmemory));
+
+ PORTAL_MODULE_UNUSE;
+ PJK_UT_MSG("<<<\n");
+}
+
+int __init
+kptllnd_module_init (void)
+{
+ int rc;
+
+ PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__);
+
+ /*
+ * Display the module parameters
+ */
+ CDEBUG(D_INFO,"ntx = %d\n",*kptllnd_tunables.kptl_ntx);
+ CDEBUG(D_INFO,"ntx_nblk = %d\n",*kptllnd_tunables.kptl_ntx_nblk);
+ CDEBUG(D_INFO,"concurrent_peers = %d\n",*kptllnd_tunables.kptl_concurrent_peers);
+ CDEBUG(D_INFO,"cksum = %d\n",*kptllnd_tunables.kptl_cksum);
+ CDEBUG(D_INFO,"portal = %d\n",*kptllnd_tunables.kptl_portal);
+ CDEBUG(D_INFO,"timeout = %d (seconds)\n",*kptllnd_tunables.kptl_timeout);
+ CDEBUG(D_INFO,"rxb_npages = %d\n",*kptllnd_tunables.kptl_rxb_npages);
+ CDEBUG(D_INFO,"credits = %d\n",*kptllnd_tunables.kptl_credits);
+ CDEBUG(D_INFO,"peercredits = %d\n",*kptllnd_tunables.kptl_peercredits);
+ CDEBUG(D_INFO,"max_immd_size = %d\n",*kptllnd_tunables.kptl_max_immd_size);
+
+ ptllnd_assert_wire_constants();
+
+ rc = kptllnd_tunables_init();
+ if (rc != 0)
+ return rc;
+
+ lnet_register_lnd(&kptllnd_lnd);
+
+ PJK_UT_MSG("<<<\n");
+ return 0;
+}
+
void __exit
kptllnd_module_fini (void)
{
+
PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__);
- PtlNIFini(nih);
+ lnet_unregister_lnd(&kptllnd_lnd);
+ kptllnd_tunables_fini();
+ kpttllnd_get_stats();
PJK_UT_MSG("<<<\n");
}
-int __init
-kptllnd_module_init (void)
+#define DO_TYPE(x) case x: return #x;
+
+const char *get_ev_type_string(int type)
{
- int rc = 0;\r
- lnet_process_id_t portals_id;
- PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__);\r
- \r
- PJK_UT_MSG("PtlNIInit\n");
- rc = PtlNIInit(PTL_IFACE_DEFAULT, 0, NULL, NULL, &nih);\r
- if (rc != PTL_OK && rc != PTL_IFACE_DUP){
- /*CERROR ("PtlNIInit: error %d\n", rc);*/
- goto failed;
+ switch(type)
+ {
+ DO_TYPE(PTL_EVENT_GET_START);
+ DO_TYPE(PTL_EVENT_GET_END);
+ DO_TYPE(PTL_EVENT_PUT_START);
+ DO_TYPE(PTL_EVENT_PUT_END);
+ DO_TYPE(PTL_EVENT_REPLY_START);
+ DO_TYPE(PTL_EVENT_REPLY_END);
+ DO_TYPE(PTL_EVENT_ACK);
+ DO_TYPE(PTL_EVENT_SEND_START);
+ DO_TYPE(PTL_EVENT_SEND_END);
+ DO_TYPE(PTL_EVENT_UNLINK);
+ default:
+ return "";
}
- \r
- PJK_UT_MSG("PtlGetId\n");
- if(rc != PtlGetId(nih,&portals_id)){
- /*CERROR ("PtlGetID: error %d\n", rc);*/
- }else{\r
- PJK_UT_MSG("ptl nid=" LPX64 "\n",portals_id.nid);
- }
-
-failed:
- PJK_UT_MSG("<<<\n");
- return rc;
-}\r
-\r
+}
+
+const char *get_msg_type_string(int type)
+{
+ switch(type)
+ {
+ DO_TYPE(PTLLND_MSG_TYPE_INVALID);
+ DO_TYPE(PLTLND_MSG_TYPE_PUT);
+ DO_TYPE(PTLLND_MSG_TYPE_GET);
+ DO_TYPE(PTLLND_MSG_TYPE_IMMEDIATE);
+ DO_TYPE(PTLLND_MSG_TYPE_HELLO);
+ default:
+ return "";
+ }
+}
+
+#define LOGSTAT(x) PJK_UT_MSG_ALWAYS("%30.30s %d\n",#x,kptllnd_stats.x);
+
+kptl_stats_t* kpttllnd_get_stats(void)
+{
+ LOGSTAT(kps_incoming_checksums_calculated);
+ LOGSTAT(kps_incoming_checksums_invalid);
+ LOGSTAT(kps_cleaning_caneled_peers);
+ LOGSTAT(kps_checking_buckets);
+ LOGSTAT(kps_too_many_peers);
+ LOGSTAT(kps_peers_created);
+ LOGSTAT(kps_no_credits);
+ LOGSTAT(kps_saving_last_credit);
+ LOGSTAT(kps_rx_allocated);
+ LOGSTAT(kps_rx_released);
+ LOGSTAT(kps_rx_allocation_failed);
+ LOGSTAT(kps_tx_allocated);
+ LOGSTAT(kps_tx_released);
+ LOGSTAT(kpt_tx_allocation_failed);
+ LOGSTAT(kpx_recv_delayed);
+ LOGSTAT(kpx_send_routing);
+ LOGSTAT(kpx_send_target_is_router);
+
+ return &kptllnd_stats;
+}
+
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel Portals LND v1.00");\r
-/*MODULE_LICENSE("GPL");*/
+MODULE_DESCRIPTION("Kernel Portals LND v1.00");
+MODULE_LICENSE("GPL");
module_init(kptllnd_module_init);
module_exit(kptllnd_module_fini);
#include <linux/list.h>
#include <linux/kmod.h>
#include <linux/sysctl.h>
-#include <linux/random.h>\r
-\r
+#include <linux/random.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+
+#define DEBUG_SUBSYSTEM S_NAL
+
#include <libcfs/kp30.h>
#include <lnet/lnet.h>
-#include <lnet/lib-lnet.h>\r
-\r
+#include <lnet/lib-lnet.h>
#include <portals/p30.h>
-\r
-#define PJK_UT_MSG(fmt...) do{printk("<1>ptllnd: %-30s ",__FUNCTION__);printk(fmt);}while(0)
+
+/*
+ * The PTLLND was designed to support Portals with
+ * Lustre and non-lustre UNLINK semantics.
+ * However, for now the two targets, Cray Portals
+ * on the XT3 and Lustre Portals (for testing), both
+ * have Lustre UNLINK semantics, so this is defined
+ * by default.
+ */
+#define LUSTRE_PORTALS_UNLINK_SEMANTICS
+
+
+/*
+ * Define this to enable console debug logging
+ * and simulation
+ */
+//#define PJK_DEBUGGING
+
+/*
+ * This was used for some single-node testing.
+ * It has some hacks to allow packets that come
+ * back on the loopback LND to have their addresses
+ * fixed up so that they match MDs properly, and so you
+ * can set up a connection with yourself and transfer data.
+ * WARNING: This was for UNIT testing purposes only.
+ */
+//#define TESTING_WITH_LOOPBACK
+
+
+
+#define PTL_RESERVED_MATCHBITS 0x100 /* below this value is reserved
+                                             * above is for bulk data transfer */
+#define LNET_MSG_MATCHBITS 0 /* the value for the message channel */
+
+#if CONFIG_SMP
+# define PTLLND_N_SCHED num_online_cpus() /* # schedulers */
+#else
+# define PTLLND_N_SCHED 1 /* # schedulers */
+#endif
+
+
+
+/* defaults for modparams/tunables */
+#define PTLLND_NTX 32 /* # tx descs */
+#define PTLLND_NTX_NBLK 256 /* # reserved tx descs */
+#define PTLLND_NRX (64 * num_online_cpus()) /* # rx desc */
+#define PTLLND_CONCURRENT_PEERS 1152 /* # nodes all talking at once to me */
+#define PTLLND_CKSUM            0            /* checksum kptl_msg_t? 0 = Disabled */
+#define PTLLND_TIMEOUT 50 /* default comms timeout (seconds) */
+#define PTLLND_PORTAL           9            /* The same portal PTLRPC uses when talking to Cray Portals */
+#define PTLLND_RXB_NPAGES 1 /* Number of pages for a single RX Buffer */
+#define PTLLND_CREDITS 256 /* concurrent sends */
+#define PTLLND_PEERCREDITS 8 /* concurrent sends to 1 peer*/
+#define PTLLND_MAX_MSG_SIZE 512 /* Maximum message size */
+#define PTLLND_PEER_HASH_SIZE 101 /* # of buckets in peer hash table */
+
+/* tunables fixed at compile time */
+#define PTLLND_CREDIT_HIGHWATER (*kptllnd_tunables.kptl_peercredits-1) /* when to eagerly return credits */
+#define PTLLND_TIMEOUT_SEC 3 /* How often we check a subset of the peer hash table for timeout*/
+
+/************************/
+/* derived constants... */
+/* TX messages (shared by all connections) */
+#define PTLLND_TX_MSGS() (*kptllnd_tunables.kptl_ntx + \
+ *kptllnd_tunables.kptl_ntx_nblk)
+
+
+typedef struct
+{
+ int *kptl_ntx; /* # tx descs */
+ int *kptl_ntx_nblk; /* # reserved tx descs */
+ int *kptl_concurrent_peers; /* max # nodes all talking to me */
+ int *kptl_cksum; /* checksum kptl_msg_t? */
+ int *kptl_timeout; /* comms timeout (seconds) */
+ int *kptl_portal; /* portal number */
+ int *kptl_rxb_npages; /* number of pages for rx buffer */
+ int *kptl_credits; /* number of credits */
+        int             *kptl_peercredits;          /* number of credits per peer */
+ int *kptl_max_immd_size; /* max immd message size*/
+ int *kptl_peer_hash_table_size; /* # slots in peer hash table */
+
+#ifdef PJK_DEBUGGING
+ int *kptl_simulation_bitmap;/* simulation bitmap */
+#endif
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+ struct ctl_table_header *kptl_sysctl; /* sysctl interface */
+#endif
+} kptl_tunables_t;
+
+
+
+#include "ptllnd_wire.h"
+
+/***********************************************************************/
+
+typedef struct kptl_data kptl_data_t;
+typedef struct kptl_rx_buffer kptl_rx_buffer_t;
+typedef struct kptl_peer kptl_peer_t;
+
+#define POSTED_OBJECT_TYPE_RESERVED 0
+#define POSTED_OBJECT_TYPE_TX 1
+#define POSTED_OBJECT_TYPE_RXB 2
+
+typedef struct
+{
+ __u32 pof_type : 2;
+}kptl_posted_object_flags_t;
+
+typedef struct kptl_posted_object
+{
+ kptl_data_t *po_kptllnd_data; /* LND Instance Data */
+ kptl_posted_object_flags_t po_flags; /* flags and state */
+} kptl_posted_object_t;
+
+typedef struct kptl_rx /* receive message */
+{
+ struct list_head rx_list; /* queue for attention */
+ atomic_t rx_refcount;
+ kptl_rx_buffer_t *rx_rxb; /* the rx buffer pointer */
+ kptl_msg_t *rx_msg;
+ int rx_nob; /* the number of bytes rcvd */
+        ptl_process_id_t        rx_initiator;           /* who sent the packet */
+ kptl_peer_t *rx_peer; /* pointer to peer */
+ size_t rx_payload[0]; /* payload QQQ*/
+} kptl_rx_t;
+
+typedef struct kptl_rx_buffer_pool
+{
+ spinlock_t rxbp_lock;
+ struct list_head rxbp_list;
+ int rxbp_count; /* the number of elements in the list */
+ int rxbp_reserved; /* the number currently reserved */
+ int rxbp_shutdown; /* the shutdown flag for the pool */
+ int rxbp_posted; /* the number of elements posted */
+}kptl_rx_buffer_pool_t;
+
+typedef enum
+{
+ RXB_STATE_UNINITIALIZED = 0,
+ RXB_STATE_IDLE = 1,
+ RXB_STATE_POSTED = 2,
+}kptl_rxb_state_t;
+
+struct kptl_rx_buffer
+{
+        /* NB - because this buffer is assigned to an MD's usr_ptr
+ * It MUST have kptl_posted_object_t as the first member
+ * so that the real type of the element can be determined
+ */
+ kptl_posted_object_t rxb_po;
+ kptl_rx_buffer_pool_t *rxb_pool;
+ struct list_head rxb_list; /* for the rxb_pool list */
+ struct list_head rxb_repost_list;/* for the kptl_sched_rxbq list*/
+ kptl_rxb_state_t rxb_state; /* the state of this rx buffer*/
+ atomic_t rxb_refcount; /* outstanding rx */
+ ptl_handle_md_t rxb_mdh; /* the portals memory descriptor (MD) handle */
+ void *rxb_buffer; /* the buffer */
+
+};
+
+typedef enum
+{
+ TX_STATE_UNINITIALIZED = 0,
+ TX_STATE_ON_IDLE_QUEUE = 1,
+ TX_STATE_ALLOCATED = 2,
+ TX_STATE_WAITING_CREDITS = 3,
+ TX_STATE_WAITING_RESPONSE = 4
+}kptl_tx_state_t;
+
+typedef enum
+{
+ TX_TYPE_RESERVED = 0,
+ TX_TYPE_SMALL_MESSAGE = 1,
+ TX_TYPE_LARGE_PUT = 2,
+ TX_TYPE_LARGE_GET = 3,
+ TX_TYPE_LARGE_PUT_RESPONSE = 4,
+ TX_TYPE_LARGE_GET_RESPONSE = 5,
+}kptl_tx_type_t;
+
+typedef struct kptl_tx /* transmit message */
+{
+        /* NB - because this buffer is assigned to an MD's usr_ptr
+ * It MUST have kptl_posted_object_t as the first member
+ * so that the real type of the element can be determined
+ */
+ kptl_posted_object_t tx_po;
+ struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */
+ struct list_head tx_schedlist; /* queue on idle_txs ibc_tx_queue etc. */
+        atomic_t                tx_refcount;    /* Posted buffer reference count */
+ kptl_tx_state_t tx_state; /* the state of this tx descriptor */
+ int tx_seen_send_end; /* if we've seen a SEND_END event */
+ int tx_seen_reply_end; /* if we've seen a REPLY_END event */
+ kptl_tx_type_t tx_type; /* type of transfer */
+ int tx_status; /* the status of this tx descriptor */
+ int tx_isnblk; /* I'm reserved for non-blocking sends */
+ ptl_handle_md_t tx_mdh; /* the portals memory descriptor (MD) handle */
+ ptl_handle_md_t tx_mdh_msg; /* the portals MD handle for the initial message */
+ lnet_msg_t *tx_ptlmsg; /* the cookie for finalize */
+ lnet_msg_t *tx_ptlmsg_reply; /* the cookie for the reply message */
+ kptl_msg_t *tx_msg; /* the message data */
+ kptl_peer_t *tx_peer; /* the peer this is waiting on */
+ unsigned long tx_deadline; /* deadline */
+ kptl_rx_t *tx_associated_rx; /* Associated RX for Bulk RDMA */
+
+ unsigned int tx_payload_niov;
+ struct iovec *tx_payload_iov;
+ lnet_kiov_t *tx_payload_kiov;
+ unsigned int tx_payload_offset;
+ int tx_payload_nob;
+} kptl_tx_t;
+
+
+typedef enum
+{
+ PEER_STATE_UNINITIALIZED = 0,
+ PEER_STATE_WAITING_HELLO = 1,
+ PEER_STATE_ACTIVE = 2,
+ PEER_STATE_CANCELED = 3,
+}kptllnd_peer_state_t;
+
+struct kptl_peer
+{
+ struct list_head peer_list;
+        atomic_t                peer_refcount;          /* The current references */
+ kptllnd_peer_state_t peer_state;
+ kptl_data_t *peer_kptllnd_data; /* LND Instance Data */
+ spinlock_t peer_lock; /* serialize */
+ struct list_head peer_pending_txs; /* queue of pending txs */
+        struct list_head        peer_active_txs;        /* queue of active txs */
+ int peer_active_txs_change_counter;/* updated when peer_active_txs changes*/
+ lnet_nid_t peer_nid; /* who's on the other end(s) */
+ __u64 peer_incarnation; /* peer's incarnation */
+ __u64 peer_tx_seqnum; /* next seq# to send with*/
+ int peer_credits; /* number of send credits */
+ int peer_outstanding_credits;/* number of peer credits */
+ __u64 peer_next_matchbits; /* Next value to use for tx desc matchbits */
+ __u64 peer_last_matchbits_seen; /* last matchbits seen*/
+};
+
+
+
+struct kptl_data
+{
+ int kptl_init; /* initialisation state */
+ volatile int kptl_shutdown; /* shut down? */
+ atomic_t kptl_nthreads; /* # live threads */
+ lnet_ni_t *kptl_ni; /* _the_ LND instance */
+        ptl_handle_ni_t         kptl_nih;               /* network interface handle */
+ ptl_process_id_t kptl_portals_id; /* Portals ID of interface */
+ __u64 kptl_incarnation; /* which one am I */
+ ptl_handle_eq_t kptl_eqh; /* Event Queue (EQ) */
+
+ spinlock_t kptl_sched_lock; /* serialise the next 3 members*/
+ wait_queue_head_t kptl_sched_waitq; /* schedulers sleep here */
+ struct list_head kptl_sched_txq; /* tx requiring attention */
+ struct list_head kptl_sched_rxq; /* rx requiring attention */
+ struct list_head kptl_sched_rxbq; /* rxb requiring reposting */
+
+ kptl_rx_buffer_pool_t kptl_rx_buffer_pool; /* rx buffer pool */
+        cfs_mem_cache_t*        kptl_rx_cache;          /* rx descriptor cache */
+
+ struct kptl_tx *kptl_tx_descs; /* the tx descriptors array */
+ spinlock_t kptl_tx_lock; /* serialise the next 4 members*/
+ struct list_head kptl_idle_txs; /* idle tx descriptors */
+ struct list_head kptl_idle_nblk_txs; /* idle reserved tx descriptors */
+ wait_queue_head_t kptl_idle_tx_waitq; /* block here for tx descriptor */
+
+ rwlock_t kptl_peer_rw_lock; /* lock for peer table */
+ struct list_head *kptl_peers; /* hash table of all my known peers */
+        struct list_head        kptl_canceled_peers;    /* peers in the canceled state */
+ int kptl_canceled_peers_counter; /* updated when canceled_peers is modified*/
+ int kptl_peer_hash_size; /* size of kptl_peers */
+ atomic_t kptl_npeers; /* # peers extant */
+
+};
+
+typedef struct kptl_stats
+{
+ int kps_incoming_checksums_calculated;
+ int kps_incoming_checksums_invalid;
+ int kps_cleaning_caneled_peers; /* MP Safe*/
+ int kps_checking_buckets;
+ int kps_too_many_peers; /* MP Safe*/
+ int kps_peers_created; /* MP Safe*/
+ int kps_no_credits;
+ int kps_saving_last_credit;
+ int kps_rx_allocated;
+ int kps_rx_released;
+ int kps_rx_allocation_failed;
+ int kps_tx_allocated; /* MP Safe*/
+ int kps_tx_released; /* MP Safe*/
+ int kpt_tx_allocation_failed; /* MP Safe*/
+ int kpx_recv_delayed;
+ int kpx_send_routing;
+ int kpx_send_target_is_router;
+}kptl_stats_t;
+
+/*
+ * Note: Stats updates are not atomic (for performance reasons)
+ * and therefore not MP safe. They are more an indication of
+ * things that are going on, as opposed to an actual count.
+ *
+ * (e.g. if kps_checking_buckets wasn't incrementing at some
+ * number per second, that would be an indication that the
+ * scheduler thread is stuck or stopped)
+ *
+ * However, where possible the stats updates are placed inside
+ * a spinlock to make them consistent; these are marked MP Safe above.
+ *
+ */
+#define STAT_UPDATE(n) do{ ++kptllnd_stats.n; }while(0)
+
+
+enum
+{
+ PTLLND_INIT_NOTHING = 0,
+ PTLLND_INIT_DATA = 1,
+ PTLLND_INIT_TXD = 2,
+ PTLLND_INIT_RXD = 3,
+ PTLLND_INIT_ALL = 4,
+};
+
+
+extern kptl_tunables_t kptllnd_tunables;
+extern kptl_stats_t kptllnd_stats;
+
+int kptllnd_startup (
+ lnet_ni_t *ni);
+
+void kptllnd_shutdown (
+ lnet_ni_t *ni);
+
+int kptllnd_ctl(
+ lnet_ni_t *ni,
+ unsigned int cmd,
+ void *arg);
+
+int kptllnd_send (
+ lnet_ni_t *ni,
+ void *private,
+ lnet_msg_t *lntmsg);
+
+int kptllnd_recv (
+ lnet_ni_t *ni,
+ void *private,
+ lnet_msg_t *lntmsg,
+ int delayed,
+ unsigned int niov,
+ struct iovec *iov,
+ lnet_kiov_t *kiov,
+ unsigned int offset,
+ unsigned int mlen,
+ unsigned int rlen);
+
+void kptllnd_eq_callback(
+ ptl_event_t *evp);
+
+int kptllnd_scheduler(
+ void *arg);
+
+int kptllnd_thread_start(
+ int (*fn)(void *arg),
+ int id,
+ kptl_data_t *kptllnd_data);
+
+int kptllnd_tunables_init(void);
+void kptllnd_tunables_fini(void);
+
+const char *get_ev_type_string(
+ int evtype);
+
+const char *get_msg_type_string(
+ int type);
+
+kptl_stats_t* kpttllnd_get_stats(void);
+
+void
+kptllnd_posted_object_setup(
+ kptl_posted_object_t* posted_obj,
+ kptl_data_t *kptllnd_data,
+ int type);
+
+/*
+ * RX BUFFER SUPPORT FUNCTIONS
+ */
+
+void
+kptllnd_rx_buffer_pool_init(
+ kptl_rx_buffer_pool_t *rxbp);
+
+void
+kptllnd_rx_buffer_pool_fini(
+ kptl_rx_buffer_pool_t *rxbp);
+
+int
+kptllnd_rx_buffer_pool_reserve(
+ kptl_rx_buffer_pool_t *rxbp,
+ kptl_data_t *kptllnd_data,
+ int count);
+
+void
+kptllnd_rx_buffer_pool_unreserve(
+ kptl_rx_buffer_pool_t *rxbp,
+ int count);
+
+void
+kptllnd_rx_buffer_callback(
+ ptl_event_t *ev);
+
+void
+kptllnd_rx_buffer_scheduled_post(
+ kptl_rx_buffer_t *rxb);
+
+void
+kptllnd_rx_buffer_post_handle_error(
+ kptl_rx_buffer_t *rxb);
+
+void
+kptllnd_rx_buffer_decref(
+ kptl_rx_buffer_t *rxb,
+ const char *owner);
+
+/*
+ * RX SUPPORT FUNCTIONS
+ */
+void
+kptllnd_rx_scheduler_handler(
+ kptl_rx_t *rx);
+
+void
+kptllnd_rx_addref(
+ kptl_rx_t *rx,
+ const char *owner);
+
+void
+kptllnd_rx_decref(
+ kptl_rx_t *rx,
+ const char *owner,
+ kptl_data_t *kptllnd_data);
+
+/*
+ * PEER SUPPORT FUNCTIONS
+ */
+void
+kptllnd_peer_decref (
+ kptl_peer_t *peer,
+ const char *owner);
+void
+kptllnd_peer_addref (
+ kptl_peer_t *peer,
+ const char *owner);
+
+int
+kptllnd_peer_del (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid);
+
+void
+kptllnd_peer_cancel(
+ kptl_peer_t *peer);
+
+void
+kptllnd_peer_queue_tx (
+ kptl_peer_t *peer,
+ kptl_tx_t *tx);
+
+void
+kptllnd_peer_queue_bulk_rdma_tx_locked(
+ kptl_peer_t *peer,
+ kptl_tx_t *tx);
+
+void
+kptllnd_peer_dequeue_tx(
+ kptl_peer_t *peer,
+ kptl_tx_t *tx);
+void
+kptllnd_peer_dequeue_tx_locked(
+ kptl_peer_t *peer,
+ kptl_tx_t *tx);
+
+int
+kptllnd_peer_connect (
+ kptl_tx_t *tx,
+ lnet_nid_t nid );
+
+void
+kptllnd_peer_check_sends (
+ kptl_peer_t *peer );
+void
+kptllnd_peer_check_bucket (
+ int idx,
+ kptl_data_t *kptllnd_data);
+
+void
+kptllnd_tx_launch (
+ kptl_tx_t *tx,
+ lnet_nid_t target_nid,
+ lnet_msg_t *ptlmsg );
+
+kptl_peer_t *
+kptllnd_peer_find (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid);
+
+kptl_peer_t *
+kptllnd_peer_handle_hello (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid,
+ kptl_msg_t *msg);
+
+static inline struct list_head *
+kptllnd_nid2peerlist (kptl_data_t *kptllnd_data,lnet_nid_t nid)
+{
+ unsigned int hash = ((unsigned int)nid) % kptllnd_data->kptl_peer_hash_size;
+
+ return (&kptllnd_data->kptl_peers [hash]);
+}
+
+/*
+ * TX SUPPORT FUNCTIONS
+ */
+int
+kptllnd_setup_tx_descs (
+ kptl_data_t *kptllnd_data);
+
+void
+kptllnd_cleanup_tx_descs(
+ kptl_data_t *kptllnd_data);
+
+void
+kptllnd_tx_addref(
+ kptl_tx_t *tx);
+void
+kptllnd_tx_decref(
+ kptl_tx_t *tx);
+void
+kptllnd_tx_scheduled_decref(
+ kptl_tx_t *tx);
+void
+kptllnd_tx_done (
+ kptl_tx_t *tx);
+kptl_tx_t *
+kptllnd_get_idle_tx(
+ kptl_data_t *kptllnd_data,
+ int may_block,
+ kptl_tx_type_t purpose);
+
+void
+kptllnd_tx_callback(
+ ptl_event_t *ev);
+
+/*
+ * MESSAGE SUPPORT FUNCTIONS
+ */
+void
+kptllnd_init_msg(
+ kptl_msg_t *msg,
+ int type,
+ int body_nob);
+
+void
+kptllnd_msg_pack(
+ kptl_msg_t *msgp,
+ int credits,
+ lnet_nid_t dstnid,
+ __u64 dststamp,
+ __u64 seq,
+ kptl_data_t *kptllnd_data);
+
+int
+kptllnd_msg_unpack(
+ kptl_msg_t *msg,
+ int nob,
+ kptl_data_t *kptllnd_data);
+
+/*
+ * MISC SUPPORT FUNCTIONS
+ */
+
+typedef union {
+ struct iovec iov[PTL_MD_MAX_IOV];
+ ptl_kiov_t kiov[PTL_MD_MAX_IOV];
+}tempiov_t;
+
+void
+kptllnd_setup_md(
+ kptl_data_t *kptllnd_data,
+ ptl_md_t *md,
+ unsigned int op,
+ kptl_tx_t *tx,
+ unsigned int payload_niov,
+ struct iovec *payload_iov,
+ lnet_kiov_t *payload_kiov,
+ unsigned int payload_offset,
+ int payload_nob,
+ tempiov_t *tempiov);
+
+int kptllnd_process_scheduled_tx(kptl_data_t *kptllnd_data);
+int kptllnd_process_scheduled_rx(kptl_data_t *kptllnd_data);
+int kptllnd_process_scheduled_rxb(kptl_data_t *kptllnd_data);
+
+static inline lnet_nid_t ptl2lnetnid(kptl_data_t *kptllnd_data,ptl_nid_t portals_nid)
+{
+ return PTL_MKNID(PTL_NIDNET(kptllnd_data->kptl_ni->ni_nid), PTL_NIDADDR(portals_nid) );
+}
+
+static inline ptl_nid_t lnet2ptlnid(kptl_data_t *kptllnd_data,lnet_nid_t lnet_nid)
+{
+ return PTL_MKNID(PTL_NIDNET(kptllnd_data->kptl_portals_id.nid), PTL_NIDADDR(lnet_nid) );
+}
+
+#ifdef PJK_DEBUGGING
+
+#define PJK_UT_MSG_ALWAYS(fmt, a...) \
+do{ \
+ printk("<1>ptllnd:%-30s:%u:",__FUNCTION__,cfs_curproc_pid()); \
+ printk(fmt,## a); \
+ CDEBUG(D_TRACE,fmt,## a); \
+}while(0)
+
+#define PJK_UT_MSG_SIMULATION(fmt, a...) PJK_UT_MSG_ALWAYS(fmt, ## a )
+
+
+#if 1
+#define PJK_UT_MSG_DATA(fmt, a...) do{}while(0)
+#else
+#define PJK_UT_MSG_DATA(fmt, a...) PJK_UT_MSG_ALWAYS(fmt, ## a )
+#endif
+
+#if 1
+#define PJK_UT_MSG(fmt, a...) do{}while(0)
+#else
+#define PJK_UT_MSG(fmt, a...) PJK_UT_MSG_ALWAYS(fmt, ## a )
+#endif
+
+
+#define SIMULATION_FAIL_BLOCKING_TX_PUT_ALLOC 0 /* 0x00000001 */
+#define SIMULATION_FAIL_BLOCKING_TX_GET_ALLOC 1 /* 0x00000002 */
+#define SIMULATION_FAIL_BLOCKING_TX 2 /* 0x00000004 */
+#define SIMULATION_FAIL_BLOCKING_RX_ALLOC 3 /* 0x00000008 */
+
+#define IS_SIMULATION_ENABLED(x) \
+ (((*kptllnd_tunables.kptl_simulation_bitmap) & 1<< SIMULATION_##x) != 0)
+
+
+#else
+
+
+#define PJK_UT_MSG_ALWAYS(fmt, a...) do{}while(0)
+#define PJK_UT_MSG_SIMULATION(fmt, a...) do{}while(0)
+#define PJK_UT_MSG_DATA(fmt, a...) do{}while(0)
+#define PJK_UT_MSG(fmt, a...) do{}while(0)
+
+#define IS_SIMULATION_ENABLED(x) 0
+
+#endif
+
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc.
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "ptllnd.h"
+
+void kptllnd_clean_canceled_peers(kptl_data_t *kptllnd_data);
+
+void
+kptllnd_setup_md(
+ kptl_data_t *kptllnd_data,
+ ptl_md_t *md,
+ unsigned int op,
+ kptl_tx_t *tx,
+ unsigned int payload_niov,
+ struct iovec *payload_iov,
+ lnet_kiov_t *payload_kiov,
+ unsigned int payload_offset,
+ int payload_nob,
+ tempiov_t *tempiov)
+{
+ unsigned int niov = 0;
+
+ PJK_UT_MSG_DATA("%s nob=%d offset=%d niov=%d\n",
+ op == PTL_MD_OP_GET ? "GET" : "PUT",
+ payload_nob,payload_offset,payload_niov);
+
+ /* One but not both of iov or kiov must be NULL (XOR) */
+ LASSERT( (payload_iov != 0 && payload_kiov == 0) ||
+ (payload_iov == 0 && payload_kiov != 0 ) );
+
+ /* We have a put or get operation*/
+ LASSERT( op == PTL_MD_OP_GET || op == PTL_MD_OP_PUT);
+
+
+ /* Only one operation then unlink */
+ md->threshold = 1;
+
+ /* Get operations need threshold +1 to handle the
+ * reply operation
+ */
+ if( op == PTL_MD_OP_GET)
+ md->threshold++;
+
+ /* setup the options*/
+ md->options = op;
+
+ /* If this is a PUT then we need to disable ACK */
+ /* we don't need an ACK, we'll get a callback when it is complete */
+ if( op == PTL_MD_OP_PUT)
+ md->options |= PTL_MD_ACK_DISABLE;
+
+ /* we don't care about the start event */
+ md->options |= PTL_MD_EVENT_START_DISABLE;
+
+ /* point back to this TX descriptor so we know what to complete
+ * when the event is triggered */
+ md->user_ptr = tx;
+
+ md->eq_handle = kptllnd_data->kptl_eqh;
+ if (payload_iov != NULL){
+
+ while (payload_offset >= payload_iov->iov_len) {
+ payload_offset -= payload_iov->iov_len;
+ payload_iov++;
+ payload_niov--;
+ LASSERT (payload_niov > 0);
+ }
+
+ while(payload_nob){
+ LASSERT( payload_offset < payload_iov->iov_len);
+ LASSERT (payload_niov > 0);
+ LASSERT (niov < sizeof(tempiov->iov)/sizeof(tempiov->iov[0]));
+
+ tempiov->iov[niov].iov_base = payload_iov->iov_base + payload_offset;
+ tempiov->iov[niov].iov_len = min((int)(payload_iov->iov_len - payload_offset),
+ (int)payload_nob);
+
+ payload_offset = 0;
+ payload_nob -= tempiov->iov[niov].iov_len;
+ payload_iov++;
+ payload_niov--;
+ niov++;
+ }
+
+ md->start = tempiov->iov;
+ md->options |= PTL_MD_IOVEC;
+
+ }else{
+
+
+ while (payload_offset >= payload_kiov->kiov_len) {
+ payload_offset -= payload_kiov->kiov_len;
+ payload_kiov++;
+ payload_niov--;
+ LASSERT (payload_niov > 0);
+ }
+
+ while(payload_nob){
+ LASSERT( payload_offset < payload_kiov->kiov_len);
+ LASSERT (payload_niov > 0);
+ LASSERT (niov < sizeof(tempiov->kiov)/sizeof(tempiov->kiov[0]));
+
+ tempiov->kiov[niov].kiov_page = payload_kiov->kiov_page;
+ tempiov->kiov[niov].kiov_offset = payload_kiov->kiov_offset + payload_offset;
+ tempiov->kiov[niov].kiov_len = min((int)(payload_kiov->kiov_len - payload_offset),
+ (int)payload_nob);
+
+ payload_offset = 0;
+ payload_nob -= tempiov->kiov[niov].kiov_len;
+ payload_kiov++;
+ payload_niov--;
+ niov++;
+ }
+
+ md->start = tempiov->kiov;
+ md->options |= PTL_MD_KIOV;
+ }
+
+ /*
+ * When using PTL_MD_IOVEC or PTL_MD_KIOV this is not
+ * length, rather it is # iovs
+ */
+ md->length = niov;
+}
+
+int
+kptllnd_start_bulk_rdma(
+ kptl_data_t *kptllnd_data,
+ kptl_rx_t *rx,
+ lnet_msg_t *lntmsg,
+ unsigned int op,
+ unsigned int payload_niov,
+ struct iovec *payload_iov,
+ lnet_kiov_t *payload_kiov,
+ unsigned int payload_offset,
+ int payload_nob)
+{
+ kptl_tx_t *tx;
+ ptl_md_t md;
+ ptl_err_t ptl_rc;
+ ptl_err_t ptl_rc2;
+ int rc;
+ tempiov_t tempiov;
+ kptl_msg_t *rxmsg = rx->rx_msg;
+ kptl_peer_t *peer = rx->rx_peer;
+
+
+ /*
+ * Get an idle tx descriptor
+ * may NOT block: (That's the "0" param)
+ */
+ LASSERT(op == PTL_MD_OP_GET || op == PTL_MD_OP_PUT);
+ tx = kptllnd_get_idle_tx(kptllnd_data,0,
+ op == PTL_MD_OP_GET ? TX_TYPE_LARGE_PUT_RESPONSE :
+ TX_TYPE_LARGE_GET_RESPONSE);
+ if(tx == NULL){
+ CERROR ("Can't start bulk rdma %d to "LPX64": tx descs exhausted\n",
+ op, rx->rx_initiator.nid);
+ return -ENOMEM;
+ }
+
+ /*
+         * Attach the RX to the TX and take a reference
+ */
+ tx->tx_associated_rx = rx;
+ kptllnd_rx_addref(rx,"tx");
+
+ PJK_UT_MSG_DATA(">>> %s rx=%p associated with tx=%p\n",
+ op == PTL_MD_OP_GET ? "GET" : "PUT",
+ rx,tx);
+        PJK_UT_MSG_DATA("matchbits=" LPX64 "\n",
+ rxmsg->ptlm_u.req.kptlrm_matchbits);
+
+ /*
+ * Setup the MD
+ */
+ kptllnd_setup_md(kptllnd_data,&md,op,tx,
+ payload_niov,payload_iov,payload_kiov,
+ payload_offset,payload_nob,&tempiov);
+
+ spin_lock(&peer->peer_lock);
+
+ /*
+ * Attach the MD
+ */
+ ptl_rc = PtlMDBind(
+ kptllnd_data->kptl_nih,
+ md,
+ PTL_UNLINK,
+ &tx->tx_mdh);
+ if(ptl_rc != PTL_OK){
+ CERROR("PtlMDBind failed %d\n",ptl_rc);
+
+ spin_unlock(&peer->peer_lock);
+ /*
+ * Just drop the ref for this MD because it was never
+ * posted to portals
+ */
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ rc = -ENOMEM;
+ goto end;
+ }
+
+ /*
+ * And save the portals message
+ */
+ tx->tx_ptlmsg = lntmsg;
+
+ /*
+ * Queue the request on the peer
+ */
+ kptllnd_peer_queue_bulk_rdma_tx_locked(peer,tx);
+
+ /*
+         * Grab a ref so the TX doesn't disappear
+ */
+ kptllnd_tx_addref(tx);
+
+ spin_unlock(&peer->peer_lock);
+
+
+ /*
+ * Do the Put
+ */
+ if( op == PTL_MD_OP_PUT)
+ {
+ ptl_rc = PtlPut (
+ tx->tx_mdh,
+                        PTL_NOACK_REQ,          /* we don't need an ack */
+ rx->rx_initiator, /* peer "address" */
+ *kptllnd_tunables.kptl_portal, /* portal */
+ 0, /* cookie */
+ rxmsg->ptlm_u.req.kptlrm_matchbits, /* match bits */
+ 0, /* offset - unused */
+ 0); /* header data */
+ }else{
+ ptl_rc = PtlGet (
+ tx->tx_mdh,
+ rx->rx_initiator, /* peer "address" */
+ *kptllnd_tunables.kptl_portal, /* portal */
+ 0, /* cookie */
+ rxmsg->ptlm_u.req.kptlrm_matchbits, /* match bits */
+ 0); /* offset - unused*/
+ }
+
+ if(ptl_rc != PTL_OK){
+ CERROR("Ptl%s failed: %d\n",
+ op == PTL_MD_OP_GET ? "Get" : "Put",ptl_rc);
+
+ spin_lock(&peer->peer_lock);
+
+ /*
+ * Unlink the MD because it's not yet in use
+ * this should happen immediately
+ */
+ LASSERT(atomic_read(&tx->tx_refcount)>1);
+ ptl_rc2 = PtlMDUnlink(tx->tx_mdh);
+ LASSERT(ptl_rc2 == PTL_OK);
+
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+ /* If we have LUSTRE Portals UNLINK semantics
+ * we'll get an unlink event. If we have standard
+ * Portals semantics we decref the TX explicitly here
+ */
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ kptllnd_tx_decref(tx);
+#endif
+ /*
+ * We are returning failure so we don't
+ * want tx_done to finalize the message
+ * so we set it to zero
+ */
+ tx->tx_ptlmsg = 0;
+
+ kptllnd_peer_dequeue_tx_locked(peer,tx);
+ tx->tx_peer = NULL;
+
+ spin_unlock(&peer->peer_lock);
+
+ rc = -ENOMEM;
+ goto end;
+ }
+
+ rc = 0;
+
+end:
+ /*
+ * Release our temporary reference
+ * (this one could be the last)
+ */
+ kptllnd_tx_decref(tx);
+
+ PJK_UT_MSG("<<< rc=%d\n",rc);
+ return rc;
+}
+
+
+void
+kptlnd_do_put(
+ kptl_tx_t *tx,
+ lnet_msg_t *lntmsg,
+ lnet_hdr_t *hdr,
+ kptl_data_t *kptllnd_data,
+ lnet_process_id_t target,
+ unsigned int payload_niov,
+ struct iovec *payload_iov,
+ lnet_kiov_t *payload_kiov,
+ unsigned int payload_offset,
+ unsigned int payload_nob)
+{
+ LASSERT(tx != NULL);
+
+ tx->tx_payload_niov = payload_niov;
+ tx->tx_payload_iov = payload_iov;
+ tx->tx_payload_kiov = payload_kiov;
+ tx->tx_payload_offset = payload_offset;
+ tx->tx_payload_nob = payload_nob;
+
+ tx->tx_msg->ptlm_u.req.kptlrm_hdr = *hdr;
+ kptllnd_init_msg (tx->tx_msg,
+ PLTLND_MSG_TYPE_PUT,
+ sizeof(kptl_request_msg_t));
+ kptllnd_tx_launch(tx, target.nid,lntmsg);
+}
+
+int
+kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+ lnet_hdr_t *hdr = &lntmsg->msg_hdr;
+ int type = lntmsg->msg_type;
+ lnet_process_id_t target = lntmsg->msg_target;
+ int target_is_router = lntmsg->msg_target_is_router;
+ int routing = lntmsg->msg_routing;
+ unsigned int payload_niov = lntmsg->msg_niov;
+ struct iovec *payload_iov = lntmsg->msg_iov;
+ lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
+ unsigned int payload_offset = lntmsg->msg_offset;
+ unsigned int payload_nob = lntmsg->msg_len;
+ kptl_tx_t *tx = NULL;
+ kptl_data_t *kptllnd_data = ni->ni_data;
+ int nob;
+ int rc;
+ kptl_rx_t *rx = NULL;
+
+ PJK_UT_MSG_DATA(">>> SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+ PJK_UT_MSG_DATA("nob=%d nov=%d offset=%d to %s r=%d\n",
+ payload_nob, payload_niov, payload_offset,
+ libcfs_id2str(target),
+ routing);
+
+ if(routing)
+ STAT_UPDATE(kpx_send_routing);
+ if(target_is_router)
+ STAT_UPDATE(kpx_send_target_is_router);
+
+ /* NB 'private' is different depending on what we're sending.... */
+
+ CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+ payload_nob, payload_niov, libcfs_id2str(target));
+
+ LASSERT (payload_nob == 0 || payload_niov > 0);
+ LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+ /* Thread context */
+ LASSERT (!in_interrupt());
+ /* payload is either all vaddrs or all pages */
+ LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+ /*
+ * we rely on this being true, as we only store hdr
+ * in the tx descriptor, and just ignore type
+ */
+ LASSERT(hdr->type == type);
+
+ switch (type) {
+ default:
+ LBUG();
+ return -EINVAL;
+
+ case LNET_MSG_PUT:
+ PJK_UT_MSG_DATA("LNET_MSG_PUT\n");
+
+ /*
+ * Get an idle tx descriptor
+ * may block: caller is app thread (That's the "1" param)
+ */
+ tx = kptllnd_get_idle_tx(kptllnd_data,1,TX_TYPE_LARGE_PUT);
+ if(tx == NULL){
+ CERROR ("Can't send %d to "LPX64": tx descs exhausted\n",
+ type, target.nid);
+ return -ENOMEM;
+ }
+
+ /* Is the payload small enough not to need RDMA? */
+ nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]);
+ if (nob <= *kptllnd_tunables.kptl_max_immd_size)
+ break;
+
+ kptlnd_do_put(tx,lntmsg,hdr,kptllnd_data,target,
+ payload_niov,payload_iov,
+ payload_kiov,payload_offset,payload_nob);
+
+ PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+ return 0;
+
+ case LNET_MSG_GET:
+
+ PJK_UT_MSG_DATA("LNET_MSG_GET nob=%d\n",lntmsg->msg_md->md_length);
+
+ /*
+ * Get an idle tx descriptor
+ * may block: caller is app thread (That's the "1" param)
+ */
+ tx = kptllnd_get_idle_tx(kptllnd_data,1,TX_TYPE_LARGE_GET);
+ if(tx == NULL){
+ CERROR ("Can't send %d to "LPX64": tx descs exhausted\n",
+ type, target.nid);
+ return -ENOMEM;
+ }
+
+ /*
+ * If routing go immediate
+ */
+ if(target_is_router || routing)
+ break;
+
+ /* Is the payload small enough not to need RDMA? */
+ nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[lntmsg->msg_md->md_length]);
+ if (nob <= *kptllnd_tunables.kptl_max_immd_size)
+ break;
+
+ tx->tx_payload_offset = 0;
+ tx->tx_payload_niov = lntmsg->msg_md->md_niov;
+ tx->tx_payload_nob = lntmsg->msg_md->md_length;
+
+ if((lntmsg->msg_md->md_options & PTL_MD_KIOV) != 0){
+ tx->tx_payload_iov = 0;
+ tx->tx_payload_kiov = lntmsg->msg_md->md_iov.kiov;
+ }else{
+ tx->tx_payload_iov = lntmsg->msg_md->md_iov.iov;
+ tx->tx_payload_kiov = 0;
+ }
+
+
+ tx->tx_msg->ptlm_u.req.kptlrm_hdr = *hdr;
+ kptllnd_init_msg (tx->tx_msg,
+ PTLLND_MSG_TYPE_GET,
+ sizeof(kptl_request_msg_t));
+
+ tx->tx_ptlmsg_reply =
+ lnet_create_reply_msg(kptllnd_data->kptl_ni,lntmsg);
+
+ goto launch;
+
+ case LNET_MSG_ACK:
+ PJK_UT_MSG_DATA("LNET_MSG_ACK\n");
+ LASSERT (payload_nob == 0);
+ break;
+
+ case LNET_MSG_REPLY:
+ PJK_UT_MSG_DATA("LNET_MSG_REPLY\n");
+
+ /*
+ * Reply's private is the incoming rx descriptor
+ */
+ rx = private;
+ LASSERT(rx != NULL);
+
+ if(lntmsg==NULL)
+ {
+ /*
+ * Get an idle tx descriptor
+ * may NOT block That's the "0" param)
+ */
+ tx = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_LARGE_PUT);
+ if(tx == NULL){
+ CERROR ("Can't send %d to "LPX64": tx descs exhausted\n",
+ type, target.nid);
+ return -ENOMEM;
+ }
+
+ /* Is the payload small enough not to need RDMA? */
+ nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]);
+ if (nob <= *kptllnd_tunables.kptl_max_immd_size)
+ break;
+
+ kptlnd_do_put(tx,lntmsg,hdr,kptllnd_data,target,
+ payload_niov,payload_iov,
+ payload_kiov,payload_offset,payload_nob);
+
+ PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+ return 0;
+ }else{
+ /*
+ * If the request was to NOT do RDMA
+ * break out and just send back an IMMEDIATE message
+ */
+ if (rx->rx_msg->ptlm_type == PTLLND_MSG_TYPE_IMMEDIATE) {
+ /* RDMA not expected */
+ nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]);
+ if (nob > *kptllnd_tunables.kptl_max_immd_size) {
+ CERROR("REPLY for "LPX64" too big but RDMA not requested:"
+ "%d (max for message is %d)\n",
+ target.nid, payload_nob,
+ *kptllnd_tunables.kptl_max_immd_size);
+ CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
+ nob, target.nid);
+ return -EINVAL;
+ }
+ break;
+ }
+
+
+ /* Incoming message consistent with RDMA? */
+ if (rx->rx_msg->ptlm_type != PTLLND_MSG_TYPE_GET) {
+ CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
+ target.nid, rx->rx_msg->ptlm_type);
+ return -EINVAL;
+ }
+
+ rc = kptllnd_start_bulk_rdma(
+ kptllnd_data,
+ rx,
+ lntmsg,
+ PTL_MD_OP_PUT,
+ payload_niov,
+ payload_iov,
+ payload_kiov,
+ payload_offset,
+ payload_nob);
+ PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS rc=%d\n",rc);
+ return rc;
+ }
+ }
+
+
+ if(tx == NULL){
+ PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_IMMEDIATE\n");
+
+ /*
+ * Get an idle tx descriptor
+ * may NOT block: (That's the "0" param)
+ */
+ tx = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE);
+ if(tx == NULL){
+ CERROR ("Can't send %d to "LPX64": tx descs exhausted\n",
+ type, target.nid);
+ return -ENOMEM;
+ }
+ }else{
+ PJK_UT_MSG_DATA("Using PTLLND_MSG_TYPE_IMMEDIATE\n");
+ /*
+ * Repurpose this TX
+ */
+ tx->tx_type = TX_TYPE_SMALL_MESSAGE;
+
+ }
+
+ LASSERT (offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob])
+ <= *kptllnd_tunables.kptl_max_immd_size);
+
+ /*
+ * Setup the header
+ */
+ tx->tx_msg->ptlm_u.immediate.kptlim_hdr = *hdr;
+
+ if (payload_nob > 0) {
+ if (payload_kiov != NULL)
+ lnet_copy_kiov2flat(
+ *kptllnd_tunables.kptl_max_immd_size,
+ tx->tx_msg->ptlm_u.immediate.kptlim_payload,
+ 0,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ else
+ lnet_copy_iov2flat(
+ *kptllnd_tunables.kptl_max_immd_size,
+ tx->tx_msg->ptlm_u.immediate.kptlim_payload,
+ 0,
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ }
+
+ nob = offsetof(kptl_immediate_msg_t, kptlim_payload[payload_nob]);
+ kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_IMMEDIATE,nob);
+
+
+launch:
+ kptllnd_tx_launch(tx, target.nid,lntmsg);
+ PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n");
+ return 0;
+}
+
+int kptllnd_eager_recv(void *private,void **new_privatep)
+{
+ kptl_rx_t *rx = private;
+
+
+ LASSERT(rx->rx_nob < *kptllnd_tunables.kptl_max_immd_size);
+
+ /*
+ * Copy the data directly into the RX
+ */
+ memcpy(rx->rx_payload,rx->rx_msg,rx->rx_nob);
+
+ *new_privatep = rx;
+
+ /*
+         * Free the request buffer;
+         * it will repost if we are the last ones using it.
+ */
+ LASSERT(rx->rx_rxb != NULL);
+ kptllnd_rx_buffer_decref(rx->rx_rxb,"rx-eager");
+ rx->rx_rxb = NULL;
+
+
+ /*
+ * Now point the msg buffer at the RX descriptor payload
+         * rather than the RXB (because that has now been freed).
+ */
+ rx->rx_msg = (kptl_msg_t*)rx->rx_payload;
+
+ return 0;
+}
+
+
+int kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+ unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ kptl_rx_t *rx = private;
+ kptl_msg_t *rxmsg = rx->rx_msg;
+ kptl_data_t *kptllnd_data = rx->rx_rxb->rxb_po.po_kptllnd_data;
+ int nob;
+ int rc;
+
+ PJK_UT_MSG_DATA(">>> RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR\n");
+ PJK_UT_MSG_DATA("niov=%d offset=%d mlen=%d rlen=%d\n",
+ niov,offset,mlen,rlen);
+
+ LASSERT (mlen <= rlen);
+ LASSERT (mlen >= 0);
+ LASSERT (!in_interrupt());
+ /* Either all pages or all vaddrs */
+ LASSERT (!(kiov != NULL && iov != NULL));
+
+ if(delayed)
+ STAT_UPDATE(kpx_recv_delayed);
+
+ switch(rxmsg->ptlm_type)
+ {
+ default:
+ LBUG();
+ rc = -EINVAL;
+ break;
+
+ case PTLLND_MSG_TYPE_IMMEDIATE:
+ PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_IMMEDIATE\n");
+
+ nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[rlen]);
+ if (nob > *kptllnd_tunables.kptl_max_immd_size) {
+ CERROR ("Immediate message from "LPX64" too big: %d\n",
+ rxmsg->ptlm_u.immediate.kptlim_hdr.src_nid, rlen);
+ rc = -EINVAL;
+ break;
+ }
+
+ if (kiov != NULL)
+ lnet_copy_flat2kiov(
+ niov, kiov, offset,
+ *kptllnd_tunables.kptl_max_immd_size,
+ rxmsg->ptlm_u.immediate.kptlim_payload,
+ 0,
+ mlen);
+ else
+ lnet_copy_flat2iov(
+ niov, iov, offset,
+ *kptllnd_tunables.kptl_max_immd_size,
+ rxmsg->ptlm_u.immediate.kptlim_payload,
+ 0,
+ mlen);
+
+ lnet_finalize (ni, lntmsg, 0);
+ rc = 0;
+ break;
+
+ case PTLLND_MSG_TYPE_GET:
+ PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_GET\n");
+                /* We get called here just to discard any junk after the
+                 * GET hdr; there is no message to deliver. */
+                LASSERT (lntmsg == NULL);
+
+ lnet_finalize (ni, lntmsg, 0);
+
+ rc = 0;
+ break;
+
+ case PLTLND_MSG_TYPE_PUT:
+ PJK_UT_MSG_DATA("PLTLND_MSG_TYPE_PUT\n");
+
+ if (mlen == 0) { /* No payload */
+ lnet_finalize(ni, lntmsg, 0);
+ rc = 0;
+ }else{
+ rc = kptllnd_start_bulk_rdma(
+ kptllnd_data,
+ rx,
+ lntmsg,
+ PTL_MD_OP_GET,
+ niov,
+ iov,
+ kiov,
+ offset,
+ mlen);
+ }
+ break;
+ }
+
+ /*
+ * We're done with the RX
+ */
+ kptllnd_rx_decref(rx,"lnet_parse",kptllnd_data);
+
+ PJK_UT_MSG_DATA("<<< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR rc=%d\n",rc);
+ return rc;
+}
+
+
+void
+kptllnd_eq_callback(ptl_event_t *ev)
+{
+ kptl_posted_object_t *po = ev->md.user_ptr;
+
+ /*
+ * Just delegate to the correct callback
+ * based on object type
+ */
+ if(po->po_flags.pof_type == POSTED_OBJECT_TYPE_TX)
+ kptllnd_tx_callback(ev);
+ else
+ kptllnd_rx_buffer_callback(ev);
+}
+
+typedef struct
+{
+ int id; /* The unique ID */
+ kptl_data_t *kptllnd_data; /* pointer to the NAL instance */
+
+}kptllnd_thread_data_t;
+
+void
+kptllnd_thread_fini (kptllnd_thread_data_t *thread_data)
+{
+ atomic_dec (&thread_data->kptllnd_data->kptl_nthreads);
+ PORTAL_FREE(thread_data,sizeof(*thread_data));
+}
+
+int
+kptllnd_thread_start (int (*fn)(void *arg), int id,kptl_data_t *kptllnd_data)
+{
+ long pid;
+ kptllnd_thread_data_t *thread_data;
+
+        /*
+         * Allocate the thread data so we can pass more than
+         * one param to the thread function
+         */
+        PORTAL_ALLOC (thread_data,sizeof(*thread_data));
+        if(thread_data == NULL){
+                CERROR("No memory to allocate thread data structure\n");
+                return -ENOMEM;
+        }
+
+ atomic_inc (&kptllnd_data->kptl_nthreads);
+
+ /*
+ * Initialize thread data structure
+ */
+ thread_data->id = id;
+ thread_data->kptllnd_data = kptllnd_data;
+
+ pid = kernel_thread (fn, thread_data, 0);
+
+ /*
+ * On error cleanup the context explicitly
+ */
+ if (pid < 0){
+ CERROR("Failed to start kernel_thread id=%d\n",id);
+ kptllnd_thread_fini(thread_data);
+ return (int)pid;
+ }else{
+ return 0;
+ }
+}
+
+
+
+int
+kptllnd_scheduler(void *arg)
+{
+ kptllnd_thread_data_t *thread_data = arg;
+ int id = thread_data->id;
+ kptl_data_t *kptllnd_data = thread_data->kptllnd_data;
+ char name[16];
+ cfs_waitlink_t waitlink;
+ int bucket =0;
+ int buckets_to_check;
+ cfs_time_t last_check = cfs_time_current();
+ cfs_duration_t duration;
+ time_t duration_sec;
+
+ PJK_UT_MSG(">>>\n");
+
+ /*
+ * Daemonize
+ */
+ snprintf(name, sizeof(name), "kptllnd_sd_%02d", id);
+ libcfs_daemonize(name);
+
+ cfs_waitlink_init(&waitlink);
+
+ /*
+ * Keep going around
+ */
+ while(!kptllnd_data->kptl_shutdown) {
+
+ /*
+ * Wait on the scheduler waitq
+ */
+
+ set_current_state (TASK_INTERRUPTIBLE);
+ cfs_waitq_add(&kptllnd_data->kptl_sched_waitq, &waitlink);
+ cfs_waitq_timedwait(&waitlink,cfs_time_seconds(PTLLND_TIMEOUT_SEC));
+ set_current_state (TASK_RUNNING);
+ cfs_waitq_del (&kptllnd_data->kptl_sched_waitq, &waitlink);
+
+
+ duration = cfs_time_sub(cfs_time_current(),last_check);
+ duration_sec = cfs_duration_sec(duration);
+
+                /*
+                 * Check all the buckets over the kptl_timeout interval,
+                 * but only determine what percentage we are supposed to be
+                 * checking now.
+                 * Example:
+                 * (duration/HZ) = 5 sec
+                 * HASH_SIZE = 100
+                 * kptl_timeout = 60 sec
+                 * Result = 8 buckets to be checked (actually 8.3)
+                 */
+ buckets_to_check = duration_sec * kptllnd_data->kptl_peer_hash_size /
+ (*kptllnd_tunables.kptl_timeout);
+
+ if(buckets_to_check){
+ /*PJK_UT_MSG("Check Buckets %d\n",buckets_to_check);*/
+ STAT_UPDATE(kps_checking_buckets);
+
+                        /*
+                         * Because we round the bucket count down we need to
+                         * store the left over portion (.3 in the above example)
+                         * somewhere so we don't lose it.  Do this by updating
+                         * last_check not to "now" but rather to some time less
+                         * than "now" that takes the rounding error into account.
+                         */
+ last_check = cfs_time_add( last_check,
+ cfs_time_seconds(buckets_to_check *
+ *kptllnd_tunables.kptl_timeout /
+ kptllnd_data->kptl_peer_hash_size));
+
+ /*
+ * If we are supposed to check buckets then
+ * do that here.
+ */
+ while(buckets_to_check){
+ kptllnd_peer_check_bucket(bucket,kptllnd_data);
+ bucket = (bucket+1) % kptllnd_data->kptl_peer_hash_size;
+ --buckets_to_check;
+ }
+ }
+
+ /*
+ * Drain the RX queue
+ */
+ while(kptllnd_process_scheduled_rx(kptllnd_data)!=0);
+
+ /*
+ * Repost all RXBs
+ */
+ while(kptllnd_process_scheduled_rxb(kptllnd_data)!=0);
+
+ /*
+ * Drain the TX queue. Note RX's can cause new TX's
+ * to be added to the queue.
+ */
+ while(kptllnd_process_scheduled_tx(kptllnd_data)!=0);
+
+
+ /*
+ * Clean any canceled peers
+ */
+ kptllnd_clean_canceled_peers(kptllnd_data);
+ }
+
+ kptllnd_thread_fini(thread_data);
+ PJK_UT_MSG("<<<\n");
+ return (0);
+}
+
+
+int kptllnd_process_scheduled_tx(kptl_data_t *kptllnd_data)
+{
+ kptl_tx_t *tx = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+
+ /*
+ * If the list is not empty, grab the first one
+ * and pull it off the list
+ */
+ if(!list_empty(&kptllnd_data->kptl_sched_txq)){
+ tx = list_entry (kptllnd_data->kptl_sched_txq.next,
+ kptl_tx_t, tx_schedlist);
+ list_del_init(&tx->tx_schedlist);
+ }
+
+ spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+
+ if(tx){
+ PJK_UT_MSG(">>> tx=%p\n",tx);
+ kptllnd_tx_done(tx);
+ PJK_UT_MSG("<<<\n");
+ }
+
+ return tx!=NULL;
+}
+
+int kptllnd_process_scheduled_rx(kptl_data_t *kptllnd_data)
+{
+ kptl_rx_t *rx = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+
+ /*
+ * If the list is not empty, grab the first one
+ * and pull it off the list
+ */
+ if(!list_empty(&kptllnd_data->kptl_sched_rxq)){
+ rx = list_entry (kptllnd_data->kptl_sched_rxq.next,
+ kptl_rx_t, rx_list);
+ list_del_init(&rx->rx_list);
+ }
+
+ spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+
+ if(rx)
+ kptllnd_rx_scheduler_handler(rx);
+
+ return rx!=NULL;
+}
+
+int kptllnd_process_scheduled_rxb(kptl_data_t *kptllnd_data)
+{
+ kptl_rx_buffer_t *rxb = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+
+ /*
+ * If the list is not empty, grab the first one
+ * and pull it off the list
+ */
+ if(!list_empty(&kptllnd_data->kptl_sched_rxbq)){
+ rxb = list_entry (kptllnd_data->kptl_sched_rxbq.next,
+ kptl_rx_buffer_t, rxb_repost_list);
+ list_del_init(&rxb->rxb_repost_list);
+ }
+
+ spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+
+ if(rxb)
+ kptllnd_rx_buffer_post_handle_error(rxb);
+
+ return rxb!=NULL;
+}
+
+void kptllnd_clean_canceled_peers(kptl_data_t *kptllnd_data)
+{
+ unsigned long flags;
+ kptl_peer_t *peer;
+ struct list_head *iter;
+ int counter;
+
+ read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+
+ if(!list_empty(&kptllnd_data->kptl_canceled_peers)){
+ PJK_UT_MSG("Cleaning Canceled Peers\n");
+ STAT_UPDATE(kps_cleaning_caneled_peers);
+ }
+
+again:
+ counter = kptllnd_data->kptl_canceled_peers_counter;
+
+ list_for_each(iter, &kptllnd_data->kptl_canceled_peers) {
+ peer = list_entry (iter, kptl_peer_t, peer_list);
+
+
+ /*
+ * Take reference so we can manipulate it
+ * outside the lock
+ * */
+ kptllnd_peer_addref(peer,"temp");
+
+ read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+ kptllnd_peer_cancel(peer);
+ kptllnd_peer_decref(peer,"temp");
+
+ read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+ /*
+ * if the list has changed then we need to start again
+ */
+ if(counter != kptllnd_data->kptl_canceled_peers_counter)
+ goto again;
+ }
+
+ read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc.
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "ptllnd.h"
+
+static int ntx = PTLLND_NTX;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+ "# of 'normal' message descriptors");
+
+static int ntx_nblk = PTLLND_NTX_NBLK;
+CFS_MODULE_PARM(ntx_nblk, "i", int, 0444,
+ "# of 'reserved' message descriptors");
+
+static int concurrent_peers = PTLLND_CONCURRENT_PEERS;
+CFS_MODULE_PARM(concurrent_peers, "i", int, 0444,
+ "maximum number of peers that may connect");
+
+static int cksum = PTLLND_CKSUM;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+ "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = PTLLND_TIMEOUT;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+ "timeout (seconds)");
+
+static int portal = PTLLND_PORTAL;
+CFS_MODULE_PARM(portal, "i", int, 0444,
+ "portal id");
+
+static int rxb_npages = PTLLND_RXB_NPAGES;
+CFS_MODULE_PARM(rxb_npages, "i", int, 0444,
+ "# of pages for rx buffers");
+
+static int credits = PTLLND_CREDITS;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+ "concurrent sends");
+
+static int peercredits = PTLLND_PEERCREDITS;
+CFS_MODULE_PARM(peercredits, "i", int, 0444,
+ "concurrent sends to 1 peer");
+
+static int max_immd_size = PTLLND_MAX_MSG_SIZE;
+CFS_MODULE_PARM(max_immd_size, "i", int, 0444,
+ "max size of immediate message");
+
+static int peer_hash_table_size = PTLLND_PEER_HASH_SIZE;
+CFS_MODULE_PARM(peer_hash_table_size, "i", int, 0444,
+ "# of slots in the peer hash table");
+
+#ifdef PJK_DEBUGGING
+static int simulation_bitmap = 0;
+CFS_MODULE_PARM(simulation_bitmap, "i", int, 0444,
+ "simulation bitmap");
+#endif
+
+
+kptl_tunables_t kptllnd_tunables = {
+ .kptl_ntx = &ntx,
+ .kptl_ntx_nblk = &ntx_nblk,
+ .kptl_concurrent_peers = &concurrent_peers,
+ .kptl_cksum = &cksum,
+ .kptl_portal = &portal,
+ .kptl_timeout = &timeout,
+ .kptl_rxb_npages = &rxb_npages,
+ .kptl_credits = &credits,
+ .kptl_peercredits = &peercredits,
+ .kptl_max_immd_size = &max_immd_size,
+ .kptl_peer_hash_table_size = &peer_hash_table_size,
+#ifdef PJK_DEBUGGING
+ .kptl_simulation_bitmap = &simulation_bitmap,
+#endif
+};
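+/*
+ * Usage sketch (illustrative values only): the tunables above are normally
+ * set as module parameters at load time, e.g.
+ *
+ *     modprobe kptllnd credits=128 peercredits=8 timeout=60 cksum=1
+ *
+ * and, when sysctl support is compiled in (see below), the same values are
+ * exposed for inspection/adjustment under /proc/sys/ptllnd/.  Parameter
+ * names follow the CFS_MODULE_PARM declarations above; the defaults come
+ * from the PTLLND_* constants in ptllnd.h.
+ */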
+
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+
+static ctl_table kptllnd_ctl_table[] = {
+ {1, "ntx", &ntx,
+ sizeof(int), 0444, NULL, &proc_dointvec},
+ {2, "ntx_nblk", &ntx_nblk,
+ sizeof(int), 0444, NULL, &proc_dointvec},
+ {3, "concurrent_peers", &concurrent_peers,
+ sizeof(int), 0444, NULL, &proc_dointvec},
+ {4, "cksum", &cksum,
+ sizeof(int), 0644, NULL, &proc_dointvec},
+ {5, "timeout", &timeout,
+ sizeof(int), 0644, NULL, &proc_dointvec},
+ {6, "portal", &portal,
+ sizeof(int), 0444, NULL, &proc_dointvec},
+ {7, "rxb_npages", &rxb_npages,
+ sizeof(int), 0444, NULL, &proc_dointvec},
+        {8, "credits", &credits,
+         sizeof(int), 0444, NULL, &proc_dointvec},
+        {9, "peercredits", &peercredits,
+         sizeof(int), 0444, NULL, &proc_dointvec},
+        {10, "max_immd_size", &max_immd_size,
+         sizeof(int), 0444, NULL, &proc_dointvec},
+        {11, "peer_hash_table_size", &peer_hash_table_size,
+         sizeof(int), 0444, NULL, &proc_dointvec},
+
+#ifdef PJK_DEBUGGING
+        {12, "simulation_bitmap", &simulation_bitmap,
+         sizeof(int), 0444, NULL, &proc_dointvec},
+#endif
+
+ {0}
+};
+
+static ctl_table kptllnd_top_ctl_table[] = {
+ {203, "ptllnd", NULL, 0, 0555, kptllnd_ctl_table},
+ {0}
+};
+
+int
+kptllnd_tunables_init ()
+{
+ kptllnd_tunables.kptl_sysctl =
+ register_sysctl_table(kptllnd_top_ctl_table, 0);
+
+ if (kptllnd_tunables.kptl_sysctl == NULL)
+ CWARN("Can't setup /proc tunables\n");
+
+ return 0;
+}
+
+void
+kptllnd_tunables_fini ()
+{
+ if (kptllnd_tunables.kptl_sysctl != NULL)
+ unregister_sysctl_table(kptllnd_tunables.kptl_sysctl);
+}
+
+#else
+
+int
+kptllnd_tunables_init ()
+{
+ return 0;
+}
+
+void
+kptllnd_tunables_fini ()
+{
+}
+
+#endif
+
--- /dev/null
+#include "ptllnd.h"
+#include <libcfs/list.h>
+
+void
+kptllnd_peer_destroy (
+ kptl_peer_t *peer);
+
+kptl_peer_t *
+kptllnd_peer_find_locked (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid);
+
+
+
+int
+kptllnd_peer_create_locked (
+ kptl_data_t *kptllnd_data,
+ kptl_peer_t **peerp,
+ lnet_nid_t nid)
+{
+ kptl_peer_t *peer;
+ int rc;
+
+ PJK_UT_MSG(">>> nid="LPX64"\n",nid);
+
+ LASSERT (nid != PTL_NID_ANY);
+
+        /*
+         * But first check we haven't exceeded our maximum
+         * number of peers
+         */
+        if (atomic_read(&kptllnd_data->kptl_npeers) >=
+           *kptllnd_tunables.kptl_concurrent_peers) {
+                STAT_UPDATE(kps_too_many_peers);
+                CERROR("Can't create peer: too many peers\n");
+                return -EOVERFLOW; /* !! but at least it distinguishes */
+        }
+
+ PORTAL_ALLOC(peer, sizeof (*peer));
+ if (peer == NULL) {
+ CERROR("Cannot allocate memory for peer\n");
+ return -ENOMEM;
+ }
+
+ memset(peer, 0, sizeof(*peer)); /* zero flags etc */
+
+ INIT_LIST_HEAD (&peer->peer_list); /* not in the peer table yet */
+ INIT_LIST_HEAD (&peer->peer_pending_txs);
+ INIT_LIST_HEAD (&peer->peer_active_txs);
+ spin_lock_init (&peer->peer_lock);
+
+
+ peer->peer_state = PEER_STATE_WAITING_HELLO;
+ peer->peer_kptllnd_data = kptllnd_data;
+ peer->peer_nid = nid;
+ //peer->peer_incarnation = 0;
+ //peer->peer_tx_seqnum = 0;
+
+ /*
+ * Just enough to send the connect message
+ */
+ peer->peer_credits = 1;
+
+ /*
+ * We just posted this many buffers ready for the peer
+ * to send into, so give back this many credits
+ */
+ peer->peer_outstanding_credits = *kptllnd_tunables.kptl_peercredits - 1;
+
+
+ peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+ //peer->peer_last_matchbits_seen = 0;
+
+
+ /*
+ * Reserve space in the RX buffer pool for this new peer
+ */
+ rc = kptllnd_rx_buffer_pool_reserve(
+ &kptllnd_data->kptl_rx_buffer_pool,
+ kptllnd_data,
+ *kptllnd_tunables.kptl_peercredits);
+ if(rc != 0){
+ CERROR("Cannot reserve rx buffer pool space\n");
+ PORTAL_FREE(peer, sizeof (*peer));
+ return rc;
+ }
+
+ /*
+ * 1 ref for the list
+ * 1 for the caller
+ */
+ atomic_set (&peer->peer_refcount, 2);
+
+ /* npeers only grows with the global lock held */
+ atomic_inc(&kptllnd_data->kptl_npeers);
+
+ /* And add this to the list */
+ list_add_tail (&peer->peer_list,
+ kptllnd_nid2peerlist (kptllnd_data,nid));
+
+ STAT_UPDATE(kps_peers_created);
+
+ PJK_UT_MSG("<<< Peer=%p nid="LPX64"\n",peer,nid);
+ *peerp = peer;
+ return 0;
+}
+
+
+void
+kptllnd_peer_destroy (
+ kptl_peer_t *peer)
+{
+ kptl_data_t *kptllnd_data = peer->peer_kptllnd_data;
+
+ PJK_UT_MSG("Peer=%p\n",peer);
+
+ LASSERT (atomic_read (&peer->peer_refcount) == 0);
+ /* Not on the peer list */
+ LASSERT (list_empty (&peer->peer_list));
+ /* No pending tx descriptors */
+ LASSERT (list_empty (&peer->peer_pending_txs));
+ /* No active tx descriptors */
+ LASSERT (list_empty (&peer->peer_active_txs));
+
+ PORTAL_FREE (peer, sizeof (*peer));
+
+ kptllnd_rx_buffer_pool_unreserve(
+ &kptllnd_data->kptl_rx_buffer_pool,
+ *kptllnd_tunables.kptl_peercredits);
+
+
+ /* NB a peer's connections keep a reference on their peer until
+ * they are destroyed, so we can be assured that _all_ state to do
+ * with this peer has been cleaned up when its refcount drops to
+ * zero. */
+ atomic_dec(&kptllnd_data->kptl_npeers);
+}
+
+
+void
+kptllnd_peer_addref (
+ kptl_peer_t *peer,
+ const char *owner)
+{
+ atomic_inc(&peer->peer_refcount);
+
+ /*
+ * The below message could actually be out of sync
+ * with the real ref count, and is for informational purposes
+ * only
+ */
+ PJK_UT_MSG("peer=%p owner=%s count=%d\n",peer,owner,
+ atomic_read(&peer->peer_refcount));
+}
+
+void
+kptllnd_peer_decref (
+ kptl_peer_t *peer,
+ const char *owner)
+{
+ unsigned long flags;
+ kptl_data_t *kptllnd_data = peer->peer_kptllnd_data;
+
+ if( !atomic_dec_and_test(&peer->peer_refcount)){
+
+ /*
+ * The below message could actually be out of sync
+ * with the real ref count, and is for informational purposes
+ * only
+ */
+ PJK_UT_MSG("peer=%p owner=%s count=%d\n",peer,owner,
+ atomic_read(&peer->peer_refcount));
+ return;
+ }
+
+ PJK_UT_MSG("peer=%p owner=%s LAST REF\n",peer,owner);
+
+ write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+ list_del_init (&peer->peer_list);
+ if(peer->peer_state == PEER_STATE_CANCELED)
+ kptllnd_data->kptl_canceled_peers_counter++;
+ write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+ kptllnd_peer_destroy(peer);
+}
+
+
+void
+kptllnd_peer_cancel_pending_txs(
+ kptl_peer_t *peer)
+{
+ struct list_head list;
+ struct list_head *tx_temp;
+ struct list_head *tx_next;
+ kptl_tx_t *tx;
+
+
+ INIT_LIST_HEAD (&list);
+
+        /*
+         * Transfer all the PENDING TXs to a temporary list
+         * while holding the peer lock
+         */
+ spin_lock(&peer->peer_lock);
+
+ if(!list_empty(&peer->peer_pending_txs))
+ PJK_UT_MSG("Clearing Pending TXs\n");
+
+ list_for_each_safe (tx_temp, tx_next, &peer->peer_pending_txs) {
+ tx = list_entry (tx_temp, kptl_tx_t, tx_list);
+
+ list_del_init(&tx->tx_list);
+ list_add(&tx->tx_list,&list);
+ }
+
+ spin_unlock(&peer->peer_lock);
+
+        /*
+         * Now release the references outside of the peer_lock
+         */
+ list_for_each_safe (tx_temp, tx_next, &list) {
+ tx = list_entry (tx_temp, kptl_tx_t, tx_list);
+ list_del_init(&tx->tx_list);
+ kptllnd_tx_decref(tx);
+ }
+}
+
+void
+kptllnd_peer_cancel_active_txs(
+ kptl_peer_t *peer)
+{
+ struct list_head *iter;
+ kptl_tx_t *tx;
+ ptl_err_t ptl_rc;
+ int counter;
+
+ spin_lock(&peer->peer_lock);
+
+ if(!list_empty(&peer->peer_active_txs))
+ PJK_UT_MSG("Clearing Active TXs\n");
+
+again:
+
+ counter = peer->peer_active_txs_change_counter;
+
+ list_for_each (iter, &peer->peer_active_txs) {
+ tx = list_entry (iter, kptl_tx_t, tx_list);
+
+ /*
+ * Hold onto one ref so we can make these
+ * unlink calls even though we have
+ * released the lock
+ */
+ kptllnd_tx_addref(tx);
+
+ spin_unlock(&peer->peer_lock);
+
+
+                /*
+                 * Question: why is it safe to access tx_mdh and tx_mdh_msg
+                 * outside the peer_lock?  We could be racing with
+                 * tx_callback.
+                 */
+
+ if(!PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE)){
+ PJK_UT_MSG("Unlink mhd_msg\n");
+ LASSERT(atomic_read(&tx->tx_refcount)>1);
+ ptl_rc = PtlMDUnlink(tx->tx_mdh_msg);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+ if(ptl_rc == PTL_OK) {
+ tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+ kptllnd_tx_decref(tx);
+ }
+#endif
+ }
+
+ if(!PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)){
+ PJK_UT_MSG("Unlink mdh\n");
+ LASSERT(atomic_read(&tx->tx_refcount)>1);
+ ptl_rc = PtlMDUnlink(tx->tx_mdh);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+ if(ptl_rc == PTL_OK){
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ kptllnd_tx_decref(tx);
+ }
+#endif
+ }
+
+ kptllnd_tx_decref(tx);
+
+ spin_lock(&peer->peer_lock);
+
+                /*
+                 * If a change in the list has been detected
+                 * go back to the beginning
+                 */
+ if( counter != peer->peer_active_txs_change_counter)
+ goto again;
+ }
+
+ spin_unlock(&peer->peer_lock);
+}
+
+void
+kptllnd_peer_cancel(
+ kptl_peer_t *peer)
+{
+ kptl_data_t *kptllnd_data = peer->peer_kptllnd_data;
+ unsigned long flags;
+ int list_owns_ref=0;
+
+ PJK_UT_MSG(">>> Peer=%p\n",peer);
+
+ write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+ if(peer->peer_state != PEER_STATE_CANCELED){
+ peer->peer_state = PEER_STATE_CANCELED;
+ list_del_init(&peer->peer_list);
+ list_add(&peer->peer_list,&kptllnd_data->kptl_canceled_peers);
+ kptllnd_data->kptl_canceled_peers_counter++;
+ list_owns_ref = 1;
+ }
+ write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+
+ /*
+ * First cancel the pending and active TXs
+ */
+ kptllnd_peer_cancel_pending_txs(peer);
+ kptllnd_peer_cancel_active_txs(peer);
+
+
+ /* lose peerlist's ref as long as we haven't done
+ this before */
+ if(list_owns_ref)
+ kptllnd_peer_decref(peer,"list");
+
+ PJK_UT_MSG("<<< Peer=%p\n",peer);
+}
+
+int
+kptllnd_peer_del (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid)
+{
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ kptl_peer_t *peer;
+ int lo;
+ int hi;
+ int i;
+ unsigned long flags;
+ int rc = -ENOENT;
+
+
+ PJK_UT_MSG(">>> NID="LPX64"\n",nid);
+
+ /*
+ * Find the single bucket we are supposed to look at
+ * or if nid = PTL_NID_ANY then look at all of the buckets
+ */
+ if (nid != PTL_NID_ANY)
+ lo = hi = kptllnd_nid2peerlist(kptllnd_data,nid) - kptllnd_data->kptl_peers;
+ else {
+ lo = 0;
+ hi = kptllnd_data->kptl_peer_hash_size - 1;
+ }
+
+again:
+ read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt, &kptllnd_data->kptl_peers[i]) {
+ peer = list_entry (ptmp, kptl_peer_t, peer_list);
+
+ /*
+ * Is this the right one?
+ */
+ if (!(nid == PTL_NID_ANY || peer->peer_nid == nid))
+ continue;
+
+ kptllnd_peer_addref(peer,"temp"); /* 1 ref for me... */
+
+ read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock,
+ flags);
+
+ kptllnd_peer_cancel(peer);
+ kptllnd_peer_decref(peer,"temp"); /* ...until here */
+
+ rc = 0; /* matched something */
+
+ /* start again now I've dropped the lock */
+ goto again;
+ }
+ }
+
+ read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+ PJK_UT_MSG("<<< rc=%d\n",rc);
+ return (rc);
+}
+
+void
+kptllnd_peer_queue_tx_locked (
+ kptl_peer_t *peer,
+ kptl_tx_t *tx)
+{
+ PJK_UT_MSG("Peer=%p TX=%p\n",peer,tx);
+
+ LASSERT(peer->peer_state != PEER_STATE_CANCELED);
+ LASSERT(tx->tx_state == TX_STATE_ALLOCATED);
+ tx->tx_state = TX_STATE_WAITING_CREDITS;
+ LASSERT(tx->tx_peer == NULL);
+
+ kptllnd_peer_addref(peer,"tx");
+ tx->tx_peer = peer;
+
+ tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ);
+ list_add_tail(&tx->tx_list, &peer->peer_pending_txs);
+}
+
+void
+kptllnd_peer_queue_tx (
+ kptl_peer_t *peer,
+ kptl_tx_t *tx)
+{
+ spin_lock(&peer->peer_lock);
+ kptllnd_peer_queue_tx_locked (peer, tx);
+ spin_unlock(&peer->peer_lock);
+
+ kptllnd_peer_check_sends(peer);
+}
+
+
+void
+kptllnd_peer_queue_bulk_rdma_tx_locked(
+ kptl_peer_t *peer,
+ kptl_tx_t *tx)
+{
+ PJK_UT_MSG("Peer=%p TX=%p\n",peer,tx);
+
+ LASSERT(peer->peer_state != PEER_STATE_CANCELED);
+ LASSERT(tx->tx_state == TX_STATE_ALLOCATED);
+ tx->tx_state = TX_STATE_WAITING_RESPONSE;
+
+ LASSERT(tx->tx_type == TX_TYPE_LARGE_PUT_RESPONSE ||
+ tx->tx_type == TX_TYPE_LARGE_GET_RESPONSE);
+
+ LASSERT(tx->tx_peer == NULL);
+ kptllnd_peer_addref(peer,"tx");
+ tx->tx_peer = peer;
+ tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ);
+
+ list_add_tail(&tx->tx_list, &peer->peer_active_txs);
+ peer->peer_active_txs_change_counter++;
+}
+
+void
+kptllnd_peer_dequeue_tx_locked(
+ kptl_peer_t *peer,
+ kptl_tx_t *tx)
+{
+ list_del_init(&tx->tx_list);
+        /*
+         * The tx could be on the active list or possibly the
+         * pending list.  Either way we're safe to update the active
+         * txs change counter (this counter only indicates change,
+         * and in this case possible change, which is an acceptable
+         * usage)
+         */
+ peer->peer_active_txs_change_counter++;
+}
+
+void
+kptllnd_peer_dequeue_tx(
+ kptl_peer_t *peer,
+ kptl_tx_t *tx)
+{
+ spin_lock(&peer->peer_lock);
+ kptllnd_peer_dequeue_tx_locked(peer,tx);
+ spin_unlock(&peer->peer_lock);
+}
+
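+/*
+ * Push out as many of this peer's queued messages as the credit protocol
+ * allows.  Each send consumes one of our peer_credits and piggybacks all
+ * accumulated peer_outstanding_credits back to the peer; if there is
+ * nothing to send but we have reached the credit high-water mark, a NOOP
+ * is generated purely to return credits so the peer does not stall.
+ */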
+void
+kptllnd_peer_check_sends (
+ kptl_peer_t *peer )
+{
+
+ kptl_tx_t *tx;
+ kptl_data_t *kptllnd_data = peer->peer_kptllnd_data;
+ int rc,rc2;
+ ptl_md_t md;
+ ptl_handle_me_t meh;
+ ptl_process_id_t target;
+
+ /*
+ * If there is nothing to send, and we have hit the credit
+ * high water mark, then send a no-op message
+ */
+ spin_lock(&peer->peer_lock);
+
+ PJK_UT_MSG_DATA(">>>Peer=%p Credits=%d Outstanding=%d\n",
+ peer,peer->peer_credits,peer->peer_outstanding_credits);
+
+ if(list_empty(&peer->peer_pending_txs) &&
+ peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER) {
+
+ /*
+ * Get an idle tx descriptor
+ * may NOT block: (That's the "0" param)
+ */
+ tx = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE);
+ if( tx == NULL ) {
+ CERROR ("Can't return credits to "LPX64": tx descs exhausted\n",
+ peer->peer_nid);
+ }else{
+ kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_NOOP,0);
+ kptllnd_peer_queue_tx_locked(peer,tx);
+ }
+ }
+ /*
+ * Now go through all the sends to see what we can send
+ */
+ while(!list_empty(&peer->peer_pending_txs)) {
+ tx = list_entry (peer->peer_pending_txs.next, kptl_tx_t, tx_list);
+
+ LASSERT (tx->tx_state == TX_STATE_WAITING_CREDITS);
+ LASSERT (peer->peer_outstanding_credits >= 0);
+ LASSERT (peer->peer_outstanding_credits <= *kptllnd_tunables.kptl_peercredits);
+ LASSERT (peer->peer_credits >= 0);
+ LASSERT (peer->peer_credits <= *kptllnd_tunables.kptl_peercredits);
+
+ /*
+ * If there are no credits we're done
+ */
+ if (peer->peer_credits == 0) {
+ STAT_UPDATE(kps_no_credits);
+ CDEBUG(D_NET, LPX64": no credits\n",peer->peer_nid);
+ break;
+ }
+
+
+ /*
+ * If there is one credit but we have no credits to give
+ * back then we don't use our one credit and we are done
+ */
+ if (peer->peer_credits == 1 &&
+ peer->peer_outstanding_credits == 0) {
+ STAT_UPDATE(kps_saving_last_credit);
+ CDEBUG(D_NET, LPX64": not using last credit\n",
+ peer->peer_nid);
+ break;
+ }
+
+ /*
+ * Remove the tx from the list. We don't decrement the
+ * ref count here. The reference is simply transferred from
+ * the Peer to this calling function, and it will be this
+ * functions responsibility to dispose of the reference properly
+ */
+ list_del_init(&tx->tx_list);
+
+ /*
+ * If there is a NOOP in the queue but there
+ * are pending tx buffers also in the queue
+ *
+ * OR we are not at the high-water mark anymore
+ *
+ * THEN it is safe to simply discard this NOOP
+                 * and continue on.
+ */
+ if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP &&
+ (!list_empty(&peer->peer_pending_txs) ||
+ peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)) {
+ /* redundant NOOP */
+ kptllnd_tx_decref(tx);
+ CDEBUG(D_NET, LPX64": redundant noop\n",
+ peer->peer_nid);
+ continue;
+ }
+
+ PJK_UT_MSG_DATA("--- TXTXTXTXTXTXTXTXTXTXTXTXTXTX\n");
+ PJK_UT_MSG_DATA("Sending TX=%p Size=%d\n",tx,tx->tx_msg->ptlm_nob);
+ PJK_UT_MSG_DATA("Target nid="LPX64"\n",peer->peer_nid);
+
+
+ /*
+ * Assign matchbits for a put/get
+ */
+ if(tx->tx_msg->ptlm_type == PLTLND_MSG_TYPE_PUT ||
+ tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET){
+
+ PJK_UT_MSG_DATA("next matchbits="LPX64" (before)\n",
+ peer->peer_next_matchbits);
+
+
+ /* Allocate a new match bits value. It might not be needed,
+ * but we've got a lock right now and we're unlikely to
+ * wrap...
+ *
+ * A set of match bits at the low end are reserved. So we can
+ * not use them. Just skip over them. This check protects us
+ * even in the case of 64-bit rollover.
+ */
+ if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS){
+ CDEBUG(D_INFO,"Match Bits Rollover for "LPX64"\n",
+ peer->peer_nid);
+ peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+
+ }
+
+ /*
+ * Set the payload match bits and update the peer's counter
+ */
+ tx->tx_msg->ptlm_u.req.kptlrm_matchbits =
+ peer->peer_next_matchbits ++;
+
+ PJK_UT_MSG_DATA("next matchbits="LPX64" (after)\n",
+ peer->peer_next_matchbits);
+ }
+
+                /*
+                 * Complete the message: fill in all the rest
+                 * of the header
+                 */
+ kptllnd_msg_pack(
+ tx->tx_msg,
+ peer->peer_outstanding_credits,
+ peer->peer_nid,
+ peer->peer_incarnation,
+ peer->peer_tx_seqnum,
+ kptllnd_data);
+
+ /*
+ * We just sent a packet
+ */
+ peer->peer_tx_seqnum++;
+
+ /*
+ * And we've returned all of our credits
+ */
+ peer->peer_outstanding_credits = 0;
+
+ /*
+ * And we have one less credit :-(
+ */
+ peer->peer_credits--;
+
+ /*
+ * Set the state before the PtlPut() because
+ * we could get the PUT_END callback before PtlPut()
+ * returns.
+ */
+ LASSERT(tx->tx_state == TX_STATE_WAITING_CREDITS);
+ tx->tx_state = TX_STATE_WAITING_RESPONSE;
+
+ /*
+ * Construct an address that Portals needs from the NID
+ */
+
+ target.nid = lnet2ptlnid(kptllnd_data,peer->peer_nid);
+ target.pid = 0;
+
+ PJK_UT_MSG_DATA("Msg NOB = %d\n",tx->tx_msg->ptlm_nob);
+ PJK_UT_MSG_DATA("Returned Credits=%d\n",tx->tx_msg->ptlm_credits);
+ PJK_UT_MSG_DATA("Seq # = "LPX64"\n",tx->tx_msg->ptlm_seq);
+
+ PJK_UT_MSG("lnet TX nid=" LPX64 "\n",peer->peer_nid);
+ PJK_UT_MSG("ptl TX nid=" LPX64 "\n",target.nid);
+
+ if(tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET ||
+ tx->tx_msg->ptlm_type == PLTLND_MSG_TYPE_PUT){
+ tempiov_t tempiov;
+
+#ifdef TESTING_WITH_LOOPBACK
+ /*
+ * When doing loopback testing the data comes back
+ * on the given loopback nid
+ */
+ ptl_process_id_t target;
+ target.nid = PTL_NID_ANY;
+ target.pid = 0;
+#endif
+
+                        PJK_UT_MSG_DATA("matchbits=" LPX64 "\n",
+ tx->tx_msg->ptlm_u.req.kptlrm_matchbits);
+
+
+ /*
+ * Attach the ME
+ */
+ rc = PtlMEAttach(
+ kptllnd_data->kptl_nih,
+ *kptllnd_tunables.kptl_portal,
+ target,
+ tx->tx_msg->ptlm_u.req.kptlrm_matchbits,
+ 0, /* all matchbits are valid - ignore none*/
+ PTL_UNLINK,
+ PTL_INS_BEFORE,
+ &meh);
+ if(rc != 0) {
+ CERROR("PtlMeAttach failed %d\n",rc);
+ goto failed;
+ }
+
+ /* Setup the MD */
+ kptllnd_setup_md(kptllnd_data,&md,
+                                tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET ? PTL_MD_OP_PUT :
+ PTL_MD_OP_GET,
+ tx,
+ tx->tx_payload_niov,
+ tx->tx_payload_iov,
+ tx->tx_payload_kiov,
+ tx->tx_payload_offset,
+ tx->tx_payload_nob,
+ &tempiov);
+
+ /*
+ * Add a ref for this MD, because unlink
+ * events can happen at any time once
+ * something is posted.
+ */
+ kptllnd_tx_addref(tx);
+
+ /*
+ * Attach the MD
+ */
+ rc = PtlMDAttach(
+ meh,
+ md,
+ PTL_UNLINK,
+ &tx->tx_mdh);
+ if(rc != 0){
+ CERROR("PtlMDAttach failed %d\n",rc);
+
+ /*
+ * Just drop the ref for this MD because it was never
+ * posted to portals
+ */
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ kptllnd_tx_decref(tx);
+
+ rc2 = PtlMEUnlink(meh);
+ LASSERT(rc2 == 0);
+ goto failed;
+ }
+ }
+
+
+ /*
+ * Setup the MD
+ */
+ md.start = tx->tx_msg;
+ md.length = tx->tx_msg->ptlm_nob;
+ md.threshold = 1;
+ md.options = PTL_MD_OP_PUT;
+ md.options |= PTL_MD_EVENT_START_DISABLE;
+ /* we don't need an ACK, we'll get a callback when the get is complete */
+ md.options |= PTL_MD_ACK_DISABLE;
+ md.user_ptr = tx;
+ md.eq_handle = kptllnd_data->kptl_eqh;
+
+
+ /*
+ * Bind the MD
+ */
+ rc = PtlMDBind (
+ kptllnd_data->kptl_nih,
+ md,
+ PTL_UNLINK,
+ &tx->tx_mdh_msg);
+ if(rc != 0){
+ CERROR("PtlMDBind failed %d\n",rc);
+ tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+ goto failed;
+ }
+
+ list_add_tail(&tx->tx_list, &peer->peer_active_txs);
+ peer->peer_active_txs_change_counter++;
+ LASSERT(tx->tx_peer == peer);
+
+ /*
+ * Grab a ref so the TX doesn't go away
+ * if we fail.
+ */
+ kptllnd_tx_addref(tx);
+
+ spin_unlock(&peer->peer_lock);
+
+ rc = PtlPut (
+ tx->tx_mdh_msg,
+                PTL_NOACK_REQ,         /* we don't need an ack */
+ target, /* peer "address" */
+ *kptllnd_tunables.kptl_portal, /* portal */
+ 0, /* cookie */
+ LNET_MSG_MATCHBITS, /* match bits */
+ 0, /* offset */
+ 0); /* header data */
+ if(rc != 0){
+ CERROR("PtlPut error %d\n",rc);
+
+ /*
+ * Do the unlink which should succeed
+ */
+ LASSERT(atomic_read(&tx->tx_refcount)>1);
+ rc2 = PtlMDUnlink(tx->tx_mdh_msg);
+ LASSERT( rc2 == 0);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+ tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+ kptllnd_tx_decref(tx);
+#endif
+ goto failed;
+ }
+
+ /*
+ * Release our temporary reference
+ */
+ kptllnd_tx_decref(tx);
+
+ spin_lock(&peer->peer_lock);
+
+ }
+
+
+ spin_unlock(&peer->peer_lock);
+
+ PJK_UT_MSG_DATA("<<<\n");
+ return;
+
+failed:
+
+ /*
+ * Now unlink the MDs (if they were posted)
+ */
+ if(!PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)){
+ LASSERT(atomic_read(&tx->tx_refcount)>1);
+ rc2 = PtlMDUnlink(tx->tx_mdh);
+ /*
+ * The unlink should succeed
+ */
+ LASSERT( rc2 == 0);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ kptllnd_tx_decref(tx);
+#endif
+ }
+
+        /*
+         * Give back the credits (arguably pointless, since we're
+         * about to cancel this peer anyway)
+         */
+ peer->peer_outstanding_credits += tx->tx_msg->ptlm_credits;
+ peer->peer_credits++;
+
+ spin_unlock(&peer->peer_lock);
+
+ /*
+ * And cleanup this peer
+ */
+ kptllnd_peer_cancel(peer);
+
+ /*
+ * And release the tx reference
+ */
+ kptllnd_tx_decref(tx);
+
+ PJK_UT_MSG("<<< FAILED\n");
+}
+
+int
+kptllnd_peer_timedout(kptl_peer_t *peer)
+{
+ kptl_tx_t *tx;
+
+ spin_lock(&peer->peer_lock);
+
+        /*
+         * Check the head of the pending list for expiration.
+         * This is a queue, so if the head isn't expired then nothing
+         * else will be expired either.
+         */
+ if(!list_empty(&peer->peer_pending_txs)){
+ tx = list_entry(peer->peer_pending_txs.next,kptl_tx_t,tx_list);
+ if(time_after_eq(jiffies,tx->tx_deadline)){
+ spin_unlock(&peer->peer_lock);
+ PJK_UT_MSG("Peer=%p PENDING tx=%p time=%lu sec\n",
+ peer,tx,(jiffies - tx->tx_deadline)/HZ);
+ return 1;
+ }
+ }
+
+ /*
+ * Check the head of the active list
+ */
+ if(!list_empty(&peer->peer_active_txs)){
+ tx = list_entry(peer->peer_active_txs.next,kptl_tx_t,tx_list);
+ if(time_after_eq(jiffies,tx->tx_deadline)){
+ spin_unlock(&peer->peer_lock);
+ PJK_UT_MSG("Peer=%p ACTIVE tx=%p time=%lu sec\n",
+ peer,tx,(jiffies - tx->tx_deadline)/HZ);
+ return 1;
+ }
+ }
+
+ spin_unlock(&peer->peer_lock);
+ return 0;
+}
+
+
+void
+kptllnd_peer_check_bucket (int idx, kptl_data_t *kptllnd_data)
+{
+ struct list_head *peers = &kptllnd_data->kptl_peers[idx];
+ struct list_head *ptmp;
+ kptl_peer_t *peer;
+ unsigned long flags;
+
+
+ /*PJK_UT_MSG("Bucket=%d\n",idx);*/
+
+ again:
+ /* NB. We expect to have a look at all the peers and not find any
+ * rdmas to time out, so we just use a shared lock while we
+ * take a look... */
+ read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+ list_for_each (ptmp, peers) {
+ peer = list_entry (ptmp, kptl_peer_t, peer_list);
+
+ /* In case we have enough credits to return via a
+ * NOOP, but there were no non-blocking tx descs
+ * free to do it last time... */
+ kptllnd_peer_check_sends(peer);
+
+ if (!kptllnd_peer_timedout(peer))
+ continue;
+
+ kptllnd_peer_addref(peer,"temp"); /* 1 ref for me... */
+
+ read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock,
+ flags);
+
+ CERROR("Timed out RDMA with "LPX64"\n",peer->peer_nid);
+
+ kptllnd_peer_cancel(peer);
+ kptllnd_peer_decref(peer,"temp"); /* ...until here */
+
+ /* start again now I've dropped the lock */
+ goto again;
+ }
+
+ read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+}
+
+kptl_peer_t *
+kptllnd_peer_find (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid)
+{
+ kptl_peer_t *peer;
+ unsigned long flags;
+ read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+ peer = kptllnd_peer_find_locked(kptllnd_data,nid);
+        read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+ return peer;
+}
+
+kptl_peer_t *
+kptllnd_peer_find_locked (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid)
+{
+ struct list_head *peer_list = kptllnd_nid2peerlist (kptllnd_data,nid);
+ struct list_head *tmp;
+ kptl_peer_t *peer;
+
+ PJK_UT_MSG(">>> nid="LPX64"\n",nid);
+
+ list_for_each (tmp, peer_list) {
+
+ peer = list_entry (tmp, kptl_peer_t, peer_list);
+
+ LASSERT(peer->peer_state != PEER_STATE_CANCELED);
+
+ if (peer->peer_nid != nid)
+ continue;
+
+ CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
+ peer, nid, atomic_read (&peer->peer_refcount));
+
+ kptllnd_peer_addref(peer,"find");
+ PJK_UT_MSG("<<< Peer=%p\n",peer);
+ return peer;
+ }
+
+ PJK_UT_MSG("<<< NOTFOUND\n");
+ return NULL;
+}
+
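+/*
+ * Handle an incoming HELLO.  The HELLO exchange checks that max_immd_size
+ * matches on both sides, records the peer's incarnation (srcstamp), and
+ * supplies the first safe matchbits value to use so matchbits from a
+ * previous incarnation cannot be confused with new ones.  A HELLO carrying
+ * a different incarnation causes the existing peer to be cancelled and the
+ * handshake to be redone.
+ */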
+kptl_peer_t *
+kptllnd_peer_handle_hello (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid,
+ kptl_msg_t *msg)
+{
+ kptl_peer_t *peer;
+ kptl_peer_t *peer_to_cancel = 0;
+ unsigned long flags;
+ kptl_tx_t *tx_hello = 0;
+ int rc;
+ __u64 safe_matchbits_from_peer;
+ __u64 safe_matchbits_to_peer = 0;
+
+
+ PJK_UT_MSG(">>>\n");
+
+ safe_matchbits_from_peer = msg->ptlm_u.hello.kptlhm_matchbits +
+ *kptllnd_tunables.kptl_peercredits;
+
+ /*
+ * Immediate message sizes MUST be equal
+ */
+ if( msg->ptlm_u.hello.kptlhm_max_immd_size !=
+ *kptllnd_tunables.kptl_max_immd_size){
+ CERROR("IMMD message size MUST be equal for all peers got %d expected %d\n",
+ msg->ptlm_u.hello.kptlhm_max_immd_size,
+ *kptllnd_tunables.kptl_max_immd_size);
+
+ return 0;
+ }
+
+ write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+ /*
+ * Look for peer because it could have been previously here
+ */
+ peer = kptllnd_peer_find_locked(kptllnd_data,nid);
+
+ /*
+ * If peer is already here
+ */
+ if(peer != NULL){
+
+ if(peer->peer_incarnation == 0) {
+ /*
+ * Update the peer state
+ */
+ LASSERT(peer->peer_state == PEER_STATE_WAITING_HELLO);
+ peer->peer_state = PEER_STATE_ACTIVE;
+
+ /*
+ * Update the incarnation
+ */
+ peer->peer_incarnation = msg->ptlm_srcstamp;
+
+ /*
+ * Save the match bits
+ */
+ PJK_UT_MSG_DATA(" **** Updating Matchbits="LPX64" ****\n",
+ safe_matchbits_from_peer);
+
+ peer->peer_next_matchbits = safe_matchbits_from_peer;
+ if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS)
+ peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+ }
+
+                /*
+                 * If the incarnation has changed then we need to
+                 * cancel the old peer and resend the hello.
+                 */
+                else if( peer->peer_incarnation != msg->ptlm_srcstamp ) {
+
+ /*
+ * Put the match bits into the hello message
+ */
+ safe_matchbits_to_peer =
+ peer->peer_last_matchbits_seen + 1 +
+ *kptllnd_tunables.kptl_peercredits;
+
+ /*
+ * Save this peer to cancel
+ */
+ peer_to_cancel = peer;
+ peer = NULL;
+
+ }else{
+ CERROR("Receiving HELLO message on already connected peer " LPX64"\n",nid);
+ }
+ }
+
+ if( peer == NULL) {
+
+ /*
+ * Setup a connect HELLO message. We ultimately might not
+ * use it but likely we will.
+ */
+ tx_hello = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE);
+ if( tx_hello == NULL) {
+ CERROR("Unable to allocate connect message for "LPX64"\n",nid);
+ goto failed;
+ }
+
+ kptllnd_init_msg(
+ tx_hello->tx_msg,
+ PTLLND_MSG_TYPE_HELLO,
+ sizeof(kptl_hello_msg_t));
+ /*
+ * Put the match bits into the hello message
+ */
+ tx_hello->tx_msg->ptlm_u.hello.kptlhm_matchbits =
+ safe_matchbits_to_peer;
+ tx_hello->tx_msg->ptlm_u.hello.kptlhm_max_immd_size =
+ *kptllnd_tunables.kptl_max_immd_size;
+
+ rc = kptllnd_peer_create_locked ( kptllnd_data, &peer, nid);
+ if(rc != 0){
+ CERROR("Failed to create peer (nid="LPX64")\n",nid);
+ write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+ peer = NULL;
+ goto failed;
+ }
+
+ LASSERT(peer->peer_state == PEER_STATE_WAITING_HELLO);
+ peer->peer_state = PEER_STATE_ACTIVE;
+
+                /*
+                 * NB We don't need to hold the peer->peer_lock
+                 * because we haven't released the kptl_peer_rw_lock, which
+                 * prevents anyone else from getting a pointer to
+                 * this newly created peer
+                 */
+
+ /*
+ * Update the incarnation
+ */
+ peer->peer_incarnation = msg->ptlm_srcstamp;
+
+ /*
+ * Save the match bits
+ */
+ PJK_UT_MSG_DATA("**** Setting Matchbits="LPX64" ****\n",
+ safe_matchbits_from_peer);
+ peer->peer_next_matchbits = safe_matchbits_from_peer;
+ if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS)
+ peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+
+
+ /*
+ * And save them from a previous incarnation
+ */
+ peer->peer_last_matchbits_seen = safe_matchbits_to_peer;
+
+ /*
+ * Queue the message
+ */
+ kptllnd_peer_queue_tx_locked(peer,tx_hello);
+
+ /*
+ * And don't free it because it's queued
+ */
+ tx_hello = 0;
+
+ }
+
+failed:
+ write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock,flags);
+
+ if(tx_hello)
+ kptllnd_tx_decref(tx_hello);
+
+        /*
+         * Kick off any sends that are now possible
+         */
+ if(peer){
+ kptllnd_peer_check_sends(peer);
+ }
+
+ if(peer_to_cancel) {
+ kptllnd_peer_cancel(peer_to_cancel);
+ kptllnd_peer_decref(peer_to_cancel,"find");
+ }
+
+ PJK_UT_MSG("<<< Peer=%p\n",peer);
+
+ return peer;
+}
+
+void
+kptllnd_tx_launch (
+ kptl_tx_t *tx,
+ lnet_nid_t target_nid,
+ lnet_msg_t *ptlmsg )
+{
+ kptl_data_t *kptllnd_data = tx->tx_po.po_kptllnd_data;
+ kptl_peer_t *peer;
+ unsigned long flags;
+ rwlock_t *g_lock = &kptllnd_data->kptl_peer_rw_lock;
+ int rc;
+ kptl_tx_t *tx_hello;
+
+ /* If I get here, I've committed to send, so I complete the tx with
+ * failure on any problems */
+
+ PJK_UT_MSG(">>> TX=%p nid="LPX64"\n",tx,target_nid);
+
+ LASSERT (tx->tx_ptlmsg == NULL);
+ tx->tx_ptlmsg = ptlmsg; /* finalize ptlmsg on completion */
+
+ LASSERT (tx->tx_peer == NULL); /* only set when assigned a peer */
+
+
+ /*
+ * First try to find the peer (this will grab the
+ * read lock
+ */
+ peer = kptllnd_peer_find (kptllnd_data,target_nid);
+
+ /*
+ * If we find the peer
+ * then just queue the tx
+ * (which could send it)
+ */
+ if (peer != NULL) {
+ kptllnd_peer_queue_tx ( peer, tx );
+ kptllnd_peer_decref(peer,"find");
+ PJK_UT_MSG("<<< FOUND\n");
+ return;
+ }
+
+
+        /*
+         * Since we didn't find the peer, set up a HELLO message.
+         * We ultimately might not use it (in the case that the peer
+         * is racing to connect with us) but more than likely we will.
+         */
+        tx_hello = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE);
+        if( tx_hello == NULL) {
+                CERROR("Unable to allocate connect message for "LPX64"\n",target_nid);
+                kptllnd_tx_decref (tx);
+                return;
+        }
+
+ kptllnd_init_msg(
+ tx_hello->tx_msg,
+ PTLLND_MSG_TYPE_HELLO,
+ sizeof(kptl_hello_msg_t));
+
+
+ /*
+ * Now try again with the exclusive lock
+ * so if it's not found we'll add it
+ */
+ write_lock_irqsave(g_lock, flags);
+
+ peer = kptllnd_peer_find_locked (kptllnd_data,target_nid);
+
+ /*
+ * If we find the peer
+ * then just queue the tx
+ * (which could send it)
+ */
+ if (peer != NULL) {
+ write_unlock_irqrestore(g_lock, flags);
+
+ CDEBUG(D_TRACE,"HELLO message race occurred (nid="LPX64")\n",target_nid);
+
+ kptllnd_peer_queue_tx ( peer, tx );
+ kptllnd_peer_decref(peer,"find");
+
+ /* and we don't need the connection tx*/
+ kptllnd_tx_decref(tx_hello);
+
+ PJK_UT_MSG("<<< FOUND2\n");
+ return;
+ }
+
+ PJK_UT_MSG("TX %p creating NEW PEER nid="LPX64"\n",tx,target_nid);
+ rc = kptllnd_peer_create_locked ( kptllnd_data, &peer, target_nid);
+        if(rc != 0){
+                CERROR("Failed to create peer (nid="LPX64")\n",target_nid);
+                write_unlock_irqrestore(g_lock, flags);
+                kptllnd_tx_decref (tx);
+                kptllnd_tx_decref (tx_hello);
+                return;
+        }
+
+
+ /*
+ * We've never seen this peer before. So setup
+ * a default message.
+ */
+ tx_hello->tx_msg->ptlm_u.hello.kptlhm_matchbits = 0;
+ tx_hello->tx_msg->ptlm_u.hello.kptlhm_max_immd_size =
+ *kptllnd_tunables.kptl_max_immd_size;
+
+        /*
+         * Queue the connection request
+         * and the actual tx.  We have one credit so
+         * the connection request will go out, and
+         * the tx will wait for a reply.
+         */
+ PJK_UT_MSG("TXHello=%p\n",tx_hello);
+ kptllnd_peer_queue_tx_locked(peer,tx_hello);
+ kptllnd_peer_queue_tx_locked(peer,tx);
+
+ write_unlock_irqrestore(g_lock,flags);
+
+ kptllnd_peer_check_sends(peer);
+ kptllnd_peer_decref(peer,"find");
+
+ PJK_UT_MSG("<<<\n");
+}
--- /dev/null
+#include "ptllnd.h"
+
+kptl_rx_t*
+kptllnd_rx_alloc(
+ kptl_data_t *kptllnd_data );
+
+void
+kptllnd_rx_schedule (kptl_rx_t *rx);
+
+void
+kptllnd_rx_buffer_destroy(
+ kptl_rx_buffer_t *rxb);
+int
+kptllnd_rx_buffer_post(
+ kptl_rx_buffer_t *rxb);
+
+void
+kptllnd_rx_buffer_addref(
+ kptl_rx_buffer_t *rxb,
+ const char *owner);
+
+void
+kptllnd_rx_buffer_pool_init(
+ kptl_rx_buffer_pool_t *rxbp)
+{
+ PJK_UT_MSG("kptllnd_rx_buffer_pool_init\n");
+ memset(rxbp,0,sizeof(*rxbp));
+
+ spin_lock_init (&rxbp->rxbp_lock);
+ INIT_LIST_HEAD (&rxbp->rxbp_list);
+
+}
+
+void
+kptllnd_rx_buffer_pool_fini(
+ kptl_rx_buffer_pool_t *rxbp)
+{
+ kptl_rx_buffer_t *rxb;
+ int rc;
+ int i;
+
+ PJK_UT_MSG("kptllnd_rx_buffer_pool_fini\n");
+
+ spin_lock(&rxbp->rxbp_lock);
+
+ /*
+ * Set the shutdown flag under the lock
+ */
+ rxbp->rxbp_shutdown = 1;
+
+ i = 2;
+ while(!list_empty(&rxbp->rxbp_list))
+ {
+ struct list_head* iter;
+ int count = 0;
+
+ /*
+ * Count how many items are on the list right now
+ */
+ list_for_each(iter,&rxbp->rxbp_list)
+ ++count;
+
+ CDEBUG(D_TRACE,"|rxbp_list|=%d\n",count);
+
+                /*
+                 * Loop while we still have items on the list
+                 * or until we've gone through the list once
+                 */
+ while(!list_empty(&rxbp->rxbp_list) && count!=0)
+ {
+ --count;
+ rxb = list_entry (rxbp->rxbp_list.next,
+ kptl_rx_buffer_t, rxb_list);
+
+ LASSERT(rxb->rxb_state == RXB_STATE_POSTED);
+
+
+ list_del_init(&rxb->rxb_list);
+
+                        /*
+                         * We have hit the one race where the RXB has been put
+                         * on the list, but its MD has not been created yet.
+                         */
+ if(PtlHandleIsEqual(rxb->rxb_mdh,PTL_INVALID_HANDLE)){
+ list_add_tail(&rxb->rxb_list,&rxbp->rxbp_list);
+ continue;
+ }
+
+
+ /*
+ * Keep the RXB from being deleted
+ */
+ kptllnd_rx_buffer_addref(rxb,"temp");
+
+ spin_unlock(&rxbp->rxbp_lock);
+
+                        /*
+                         * Unlink the MD
+                         */
+ LASSERT(atomic_read(&rxb->rxb_refcount)>1);
+ rc = PtlMDUnlink(rxb->rxb_mdh);
+ if(rc == 0){
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+ rxb->rxb_mdh = PTL_INVALID_HANDLE;
+ kptllnd_rx_buffer_decref(rxb,"portals");
+#endif
+ /*
+ * Drop the reference we took above
+ */
+ kptllnd_rx_buffer_decref(rxb,"temp");
+
+ spin_lock(&rxbp->rxbp_lock);
+ }else{
+ PJK_UT_MSG("PtlMDUnlink(%p) rc=%d\n",rxb,rc);
+                                /*
+                                 * The unlink failed so put this back
+                                 * on the list for later
+                                 */
+ spin_lock(&rxbp->rxbp_lock);
+
+ list_add_tail(&rxb->rxb_list,&rxbp->rxbp_list);
+
+ /*
+ * Drop the reference we took above
+ */
+ kptllnd_rx_buffer_decref(rxb,"temp");
+ }
+ }
+
+ /*
+ * If there are still items on the list we
+ * need to take a break, and let the Busy RX's
+ * finish up.
+ */
+ if(!list_empty(&rxbp->rxbp_list)){
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "Waiting for %d Busy RX Buffers\n",
+ rxbp->rxbp_count);
+ spin_unlock(&rxbp->rxbp_lock);
+ cfs_pause(cfs_time_seconds(1));
+ spin_lock(&rxbp->rxbp_lock);
+ }
+ }
+
+ CDEBUG(D_TRACE,"|rxbp_list|=EMPTY\n");
+
+ if(rxbp->rxbp_count != 0){
+ PJK_UT_MSG("Waiting for %d RX Buffers to unlink\n",rxbp->rxbp_count);
+
+ i = 2;
+ while (rxbp->rxbp_count != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "Waiting for %d RX Buffers to unlink\n",
+ rxbp->rxbp_count);
+ spin_unlock(&rxbp->rxbp_lock);
+ cfs_pause(cfs_time_seconds(1));
+ spin_lock(&rxbp->rxbp_lock);
+ }
+ }
+
+ CDEBUG(D_TRACE,"|rxbp_count|=0\n");
+
+ spin_unlock(&rxbp->rxbp_lock);
+}
+
+
+int
+kptllnd_rx_buffer_pool_reserve(
+ kptl_rx_buffer_pool_t *rxbp,
+ kptl_data_t *kptllnd_data,
+ int count)
+{
+ int add = 0;
+ int i;
+ int rc;
+ kptl_rx_buffer_t *rxb;
+ int nbuffers;
+
+ spin_lock(&rxbp->rxbp_lock);
+
+ PJK_UT_MSG("kptllnd_rx_buffer_pool_reserve(%d)\n",count);
+
+        /*
+         * Prevent any more reservations while we are shutting down
+         */
+ if(rxbp->rxbp_shutdown){
+ spin_unlock(&rxbp->rxbp_lock);
+ return -ESHUTDOWN;
+ }
+
+ /*
+ * Make the reservation
+ */
+ rxbp->rxbp_reserved += count;
+
+        /*
+         * Calculate the number of buffers we need;
+         * +1 to handle any rounding error
+         */
+ nbuffers = (rxbp->rxbp_reserved) *
+ (*kptllnd_tunables.kptl_max_immd_size) /
+ (PAGE_SIZE * (*kptllnd_tunables.kptl_rxb_npages));
+ ++nbuffers ;
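+        /*
+         * Worked example (illustrative numbers only): with 16 credits
+         * reserved, max_immd_size = 8192 and one 4096-byte page per RXB,
+         * nbuffers = 16 * 8192 / 4096 + 1 = 33.
+         */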
+
+ PJK_UT_MSG("nbuffers=%d rxbp_count=%d\n",nbuffers,rxbp->rxbp_count);
+
+ if(rxbp->rxbp_count < nbuffers)
+ add = nbuffers - rxbp->rxbp_count;
+
+ PJK_UT_MSG("adding=%d\n",add);
+
+        /*
+         * While still under the same lock, assume they are all added;
+         * we'll subtract if we hit an error.
+         */
+ rxbp->rxbp_count += add;
+ spin_unlock(&rxbp->rxbp_lock);
+
+ for(i=0;i<add;i++){
+ PORTAL_ALLOC( rxb,sizeof(*rxb));
+ if(rxb == NULL){
+ CERROR("Failed to allocate data rxb%d\n",i);
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ memset(rxb,0,sizeof(*rxb));
+
+ kptllnd_posted_object_setup(&rxb->rxb_po,
+ kptllnd_data,
+ POSTED_OBJECT_TYPE_RXB);
+
+ rxb->rxb_pool = rxbp;
+ rxb->rxb_state = RXB_STATE_IDLE;
+ rxb->rxb_mdh = PTL_INVALID_HANDLE;
+ INIT_LIST_HEAD (&rxb->rxb_list);
+ INIT_LIST_HEAD (&rxb->rxb_repost_list);
+
+ PORTAL_ALLOC( rxb->rxb_buffer,
+ PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages);
+ if(rxb->rxb_buffer == NULL) {
+                        CERROR("Failed to allocate data buffer of size %d pages for rxb %d\n",
+ *kptllnd_tunables.kptl_rxb_npages,i);
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ rc = kptllnd_rx_buffer_post(rxb);
+ if(rc != 0)
+ goto failed;
+ }
+ return 0;
+
+failed:
+ spin_lock(&rxbp->rxbp_lock);
+
+ /*
+ * We really didn't add as many
+ * as we were planning to.
+ */
+ rxbp->rxbp_count -= add - i;
+
+ /*
+ * Cancel this reservation
+ */
+ rxbp->rxbp_reserved -= count;
+ spin_unlock(&rxbp->rxbp_lock);
+
+
+ if(rxb){
+ if(rxb->rxb_buffer)
+ PORTAL_FREE( rxb->rxb_buffer,PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages);
+ PORTAL_FREE( rxb,sizeof(*rxb));
+ }
+
+ return rc;
+}
+
+void
+kptllnd_rx_buffer_pool_unreserve(
+ kptl_rx_buffer_pool_t *rxbp,
+ int count)
+{
+ spin_lock(&rxbp->rxbp_lock);
+ PJK_UT_MSG("kptllnd_rx_buffer_pool_unreserve(%d)\n",count);
+ rxbp->rxbp_reserved -= count;
+ spin_unlock(&rxbp->rxbp_lock);
+}
+
+void
+kptllnd_rx_buffer_scheduled_post(
+ kptl_rx_buffer_t *rxb)
+{
+ kptl_data_t *kptllnd_data = rxb->rxb_po.po_kptllnd_data;
+ unsigned long flags;
+
+ PJK_UT_MSG("rxb=%p\n",rxb);
+
+ spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+ LASSERT(list_empty(&rxb->rxb_repost_list));
+ list_add_tail(&rxb->rxb_repost_list,&kptllnd_data->kptl_sched_rxbq);
+ wake_up(&kptllnd_data->kptl_sched_waitq);
+ spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+}
+
+
+int
+kptllnd_rx_buffer_post(
+ kptl_rx_buffer_t *rxb)
+{
+ int rc;
+ ptl_md_t md;
+ ptl_handle_me_t meh;
+ ptl_handle_md_t mdh;
+ ptl_process_id_t any;
+ kptl_data_t *kptllnd_data = rxb->rxb_po.po_kptllnd_data;
+ kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool;
+
+ any.nid = PTL_NID_ANY;
+ any.pid = PTL_PID_ANY;
+
+ /*PJK_UT_MSG("rxb=%p\n",rxb);*/
+
+ spin_lock(&rxbp->rxbp_lock);
+
+ /*
+ * No new RXB's can enter the POSTED state
+ */
+ if(rxbp->rxbp_shutdown){
+ spin_unlock(&rxbp->rxbp_lock);
+ return -ESHUTDOWN;
+ }
+
+ LASSERT(!in_interrupt());
+
+ LASSERT(rxb->rxb_state == RXB_STATE_IDLE);
+ LASSERT(atomic_read(&rxb->rxb_refcount)==0);
+ LASSERT(PtlHandleIsEqual(rxb->rxb_mdh,PTL_INVALID_HANDLE));
+
+ list_add_tail(&rxb->rxb_list,&rxbp->rxbp_list);
+ atomic_set(&rxb->rxb_refcount,1);
+ rxb->rxb_state = RXB_STATE_POSTED;
+
+ spin_unlock(&rxbp->rxbp_lock);
+
+ /*
+ * Attach the ME
+ */
+ rc = PtlMEAttach(
+ kptllnd_data->kptl_nih,
+ *kptllnd_tunables.kptl_portal,
+ any,
+ LNET_MSG_MATCHBITS,
+ 0, /* all matchbits are valid - ignore none*/
+ PTL_UNLINK,
+ PTL_INS_AFTER,
+ &meh);
+ if(rc != 0) {
+ CERROR("PtlMeAttach rxb failed %d\n",rc);
+ goto failure;
+ }
+
+ /*
+ * Setup MD
+ */
+ md.start = rxb->rxb_buffer;
+ md.length = PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages;
+ md.threshold = PTL_MD_THRESH_INF;
+ md.options = PTL_MD_OP_PUT;
+ md.options |= PTL_MD_EVENT_START_DISABLE;
+ md.options |= PTL_MD_MAX_SIZE;
+ md.user_ptr = rxb;
+ md.max_size = *kptllnd_tunables.kptl_max_immd_size;
+ md.eq_handle = kptllnd_data->kptl_eqh;
+
+
+ /*
+ * Attach the MD
+ */
+ rc = PtlMDAttach(
+ meh,
+ md,
+ PTL_UNLINK,
+ &mdh);
+ if(rc != 0){
+ int rc2;
+ CERROR("PtlMDAttach rxb failed %d\n",rc);
+ rc2 = PtlMEUnlink(meh);
+ LASSERT(rc2 == 0);
+ goto failure;
+ }
+
+        /*
+         * Assign the MDH under the lock to deal with the
+         * shutdown race on a partially constructed rxb
+         */
+ spin_lock(&rxbp->rxbp_lock);
+ rxb->rxb_mdh = mdh;
+ spin_unlock(&rxbp->rxbp_lock);
+
+ return 0;
+
+
+failure:
+ /*
+ * Cleanup on error
+ */
+ spin_lock(&rxbp->rxbp_lock);
+ list_del_init(&rxb->rxb_list);
+ atomic_set(&rxb->rxb_refcount,0);
+ rxb->rxb_state = RXB_STATE_IDLE;
+ spin_unlock(&rxbp->rxbp_lock);
+
+ return rc;
+}
+
+void
+kptllnd_rx_buffer_post_handle_error(
+ kptl_rx_buffer_t *rxb)
+{
+ int rc;
+ rc = kptllnd_rx_buffer_post(rxb);
+ if(rc!=0){
+ /* Don't log on shutdown */
+ if(rc != -ESHUTDOWN)
+                        CERROR("Failed to repost buffer rc=%d\n",rc);
+
+ kptllnd_rx_buffer_destroy(rxb);
+                /* Should I destroy the peer?  I don't think so.  It does
+                 * mean there is some chance, under very heavy load, that
+                 * we will drop a packet.  On the other hand, if more
+                 * buffers in the pool are reserved this won't happen.
+                 * Secondly, under heavy load it is likely that a new peer
+                 * will be added, and the reservations that lost their
+                 * backing buffers will get new ones at that time.
+                 *
+                 * So things are starting to get bad, but in all likelihood
+                 * they will be fine, and may even correct themselves
+                 * in time.
+                 */
+ }
+}
+
+void
+kptllnd_rx_buffer_destroy(
+ kptl_rx_buffer_t *rxb)
+{
+ kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool;
+
+ LASSERT(atomic_read(&rxb->rxb_refcount) == 0);
+ LASSERT(rxb->rxb_state == RXB_STATE_IDLE);
+ LASSERT(PtlHandleIsEqual(rxb->rxb_mdh,PTL_INVALID_HANDLE));
+
+ spin_lock(&rxbp->rxbp_lock);
+ list_del(&rxb->rxb_list);
+ rxbp->rxbp_count--;
+ spin_unlock(&rxbp->rxbp_lock);
+
+ PORTAL_FREE( rxb->rxb_buffer,PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages);
+ PORTAL_FREE(rxb,sizeof(*rxb));
+}
+
+
+
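+/*
+ * Event queue callback for RX buffer MDs: runs for PUT_END (a message
+ * arrived) and UNLINK events.  On success it wraps the incoming message in
+ * an RX descriptor and hands it to the scheduler; on failure an unlinked
+ * buffer is queued so the scheduler can repost it.
+ */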
+void
+kptllnd_rx_buffer_callback(ptl_event_t *ev)
+{
+ kptl_rx_buffer_t *rxb = ev->md.user_ptr;
+ kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool;
+ /*kptl_data_t *kptllnd_data = rxb->rxb_po.po_kptllnd_data;*/
+ kptl_rx_t *rx;
+ int nob;
+ int unlinked;
+
+ /*
+ * Set the local unlinked flag
+ */
+ unlinked = ev->type == PTL_EVENT_UNLINK;
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+ if( ev->unlinked )
+ unlinked = 1;
+#endif
+
+ if(!rxbp->rxbp_shutdown){
+ PJK_UT_MSG("RXB Callback %s(%d) rxb=%p nid="LPX64" unlink=%d\n",
+ get_ev_type_string(ev->type),ev->type,
+ rxb,ev->initiator.nid,unlinked);
+ }
+
+ LASSERT( ev->md.start == rxb->rxb_buffer);
+ LASSERT( ev->offset + ev->mlength <= PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages);
+ LASSERT( ev->type == PTL_EVENT_PUT_END || ev->type == PTL_EVENT_UNLINK);
+ LASSERT( ev->match_bits == LNET_MSG_MATCHBITS);
+
+ CDEBUG((ev->ni_fail_type == PTL_OK) ? D_NET : D_ERROR,
+ "event type %d, status %d from "LPX64"\n",
+ ev->type, ev->ni_fail_type,ev->initiator.nid);
+
+ nob = ev->mlength;
+
+ if(unlinked){
+ spin_lock(&rxbp->rxbp_lock);
+
+ /*
+ * Remove this from the list
+ */
+ list_del_init(&rxb->rxb_list);
+
+ LASSERT(rxb->rxb_state == RXB_STATE_POSTED);
+ rxb->rxb_state = RXB_STATE_IDLE;
+ rxb->rxb_mdh = PTL_INVALID_HANDLE;
+
+ if( rxbp->rxbp_shutdown){
+ spin_unlock(&rxbp->rxbp_lock);
+ kptllnd_rx_buffer_decref(rxb,"portals");
+ return;
+ }
+
+ spin_unlock(&rxbp->rxbp_lock);
+
+
+ }
+
+        /*
+         * Handle failure by just dropping the message
+         */
+        if(ev->ni_fail_type != PTL_NI_OK){
+                CERROR("Message Dropped: ev status %d\n",ev->ni_fail_type);
+ if(unlinked)
+ kptllnd_rx_buffer_scheduled_post(rxb);
+ return;
+ }
+
+ /*
+ * Allocate an RX
+ */
+ rx = kptllnd_rx_alloc(rxb->rxb_po.po_kptllnd_data);
+        if(rx == NULL){
+                CERROR("Message Dropped: Memory allocation failure\n");
+ if(unlinked)
+ kptllnd_rx_buffer_scheduled_post(rxb);
+ return;
+ }
+
+ PJK_UT_MSG_DATA("New RX=%p\n",rx);
+
+ /*
+ * If we are unlinked we can just transfer the ref
+ * that portals owned to the ref that this RX owns
+ * otherwise we need to add a ref specifically for this RX
+ */
+ if(!unlinked)
+ kptllnd_rx_buffer_addref(rxb,"rx");
+
+ rx->rx_msg = rxb->rxb_buffer + ev->offset;
+ rx->rx_rxb = rxb;
+ rx->rx_nob = nob;
+#ifdef TESTING_WITH_LOOPBACK
+ /*
+ * When testing with loopback on socknal
+ * packets are received on loopback NAL so
+ * until I figure out how to do that properly
+ * just make it look like it came from this NID
+ */
+ rx->rx_initiator = rxb->rxb_po.po_kptllnd_data->kptl_portals_id;
+#else
+ rx->rx_initiator = ev->initiator;
+#endif
+
+ kptllnd_rx_schedule(rx);
+
+ if(!rxbp->rxbp_shutdown){
+ PJK_UT_MSG("<<< rx=%p rxb=%p\n",rx,rxb);
+ }
+}
+
+
+void
+kptllnd_rx_schedule (kptl_rx_t *rx)
+{
+ unsigned long flags;
+ kptl_data_t *kptllnd_data = rx->rx_rxb->rxb_po.po_kptllnd_data;
+
+ CDEBUG(D_NET, "rx\n");
+
+ PJK_UT_MSG("RX Schedule %p\n",rx);
+
+ spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+ list_add_tail(&rx->rx_list,&kptllnd_data->kptl_sched_rxq);
+ wake_up(&kptllnd_data->kptl_sched_waitq);
+ spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+}
+
+
+void
+kptllnd_rx_scheduler_handler(kptl_rx_t *rx)
+{
+ int rc;
+ kptl_rx_buffer_t *rxb = rx->rx_rxb;
+ kptl_msg_t *msg = rx->rx_msg;
+ kptl_data_t *kptllnd_data = rxb->rxb_po.po_kptllnd_data;
+ kptl_peer_t *peer = NULL;
+ int returned_credits = 0;
+ int type = msg->ptlm_type;
+ lnet_nid_t lnet_initiator_nid = ptl2lnetnid(kptllnd_data,rx->rx_initiator.nid);
+
+
+ PJK_UT_MSG_DATA(">>> RXRXRXRXRXRXRXRXRXRXRXRX\n");
+ PJK_UT_MSG_DATA("rx=%p nob=%d\n",rx,rx->rx_nob);
+
+        /*
+         * If nob == 0, silently discard this message
+         */
+ if(rx->rx_nob == 0)
+ goto exit;
+
+ rc = kptllnd_msg_unpack(msg, rx->rx_nob, kptllnd_data);
+ if (rc != 0) {
+ CERROR ("Error %d unpacking rx from "LPX64"\n",
+ rc, rx->rx_initiator.nid);
+ goto exit;
+ }
+
+ PJK_UT_MSG_DATA("RX=%p Type=%s(%d)\n",rx,
+ get_msg_type_string(type),type);
+ PJK_UT_MSG_DATA("Msg NOB = %d\n",msg->ptlm_nob);
+ PJK_UT_MSG_DATA("Returned Credits=%d\n",msg->ptlm_credits);
+ PJK_UT_MSG_DATA("Seq # ="LPX64"\n",msg->ptlm_seq);
+ PJK_UT_MSG_DATA("lnet RX nid=" LPX64 "\n",lnet_initiator_nid);
+ PJK_UT_MSG("ptl RX nid=" LPX64 "\n",rx->rx_initiator.nid);
+
+ if(type == PTLLND_MSG_TYPE_HELLO)
+ {
+ peer = kptllnd_peer_handle_hello(
+ kptllnd_data,
+ lnet_initiator_nid,
+ msg);
+ if( peer == NULL){
+ CERROR ("Failed to create peer for "LPX64"\n",
+ lnet_initiator_nid);
+ goto exit;
+ }
+
+ if (!( msg->ptlm_dststamp == kptllnd_data->kptl_incarnation ||
+ msg->ptlm_dststamp == 0)) {
+ CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n",
+ peer->peer_nid,
+ msg->ptlm_dststamp,
+ kptllnd_data->kptl_incarnation );
+ goto exit;
+ }
+ }
+ else
+ {
+ peer = kptllnd_peer_find(kptllnd_data,lnet_initiator_nid);
+ if( peer == NULL){
+ CERROR ("No connection with "LPX64"\n",
+ lnet_initiator_nid);
+ goto exit;
+ }
+
+ if (msg->ptlm_dststamp != kptllnd_data->kptl_incarnation) {
+ CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n",
+ peer->peer_nid,
+ msg->ptlm_dststamp,
+ kptllnd_data->kptl_incarnation );
+ goto exit;
+ }
+ }
+
+ if( msg->ptlm_srcnid != peer->peer_nid){
+ CERROR ("Stale rx srcnid "LPX64" expected "LPX64"\n",
+ msg->ptlm_srcnid,
+ peer->peer_nid );
+ goto exit;
+ }
+ if( msg->ptlm_srcstamp != peer->peer_incarnation){
+                CERROR ("Stale rx from "LPX64" srcstamp "LPX64" expected "LPX64"\n",
+ peer->peer_nid,
+ msg->ptlm_srcstamp,
+ peer->peer_incarnation );
+ goto exit;
+ }
+ if( msg->ptlm_dstnid != kptllnd_data->kptl_ni->ni_nid){
+                CERROR ("Stale rx from "LPX64" dstnid "LPX64" expected "LPX64"\n",
+ peer->peer_nid,
+ msg->ptlm_dstnid,
+ kptllnd_data->kptl_ni->ni_nid );
+ goto exit;
+ }
+
+ /*
+ * Save the number of credits
+ */
+ returned_credits = msg->ptlm_credits;
+
+        /*
+         * Attach the peer to the RX; the RX is now responsible
+         * for releasing the peer reference.
+         */
+        rx->rx_peer = peer;
+        peer = NULL;
+
+        /*
+         * Note: we explicitly ignore the sequence number;
+         * it is informational only.
+         */
+ switch (msg->ptlm_type) {
+ default:
+ CERROR("Bad PTL message type %x from "LPX64"\n",
+ msg->ptlm_type, rx->rx_peer->peer_nid);
+ break;
+
+ case PTLLND_MSG_TYPE_HELLO:
+ PJK_UT_MSG("PTLLND_MSG_TYPE_HELLO\n");
+ break;
+
+ case PTLLND_MSG_TYPE_NOOP:
+ PJK_UT_MSG("PTLLND_MSG_TYPE_NOOP\n");
+ break;
+
+ case PTLLND_MSG_TYPE_IMMEDIATE:
+ PJK_UT_MSG("PTLLND_MSG_TYPE_IMMEDIATE\n");
+ rc = lnet_parse(kptllnd_data->kptl_ni,
+ &msg->ptlm_u.immediate.kptlim_hdr,
+ msg->ptlm_srcnid,
+ rx);
+ /* RX Completing asynchronously */
+ if( rc >= 0)
+ rx = 0;
+ break;
+
+ case PLTLND_MSG_TYPE_PUT:
+ case PTLLND_MSG_TYPE_GET:
+ PJK_UT_MSG("PTLLND_MSG_TYPE_%s\n",
+ msg->ptlm_type == PLTLND_MSG_TYPE_PUT ?
+ "PUT" : "GET");
+
+ /*
+ * Save the last match bits used
+ */
+ spin_lock(&rx->rx_peer->peer_lock);
+ if(msg->ptlm_u.req.kptlrm_matchbits > rx->rx_peer->peer_last_matchbits_seen)
+ rx->rx_peer->peer_last_matchbits_seen = msg->ptlm_u.req.kptlrm_matchbits;
+ spin_unlock(&rx->rx_peer->peer_lock);
+
+ rc = lnet_parse(kptllnd_data->kptl_ni,
+ &msg->ptlm_u.req.kptlrm_hdr,
+ msg->ptlm_srcnid,
+ rx);
+
+ /* RX Completing asynchronously */
+ if( rc >= 0)
+ rx = 0;
+ break;
+ }
+
+
+        CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n",
+                type, returned_credits, lnet_initiator_nid);
+
+exit:
+        /* peer == NULL if it was never assigned or has already
+         * been attached to the RX */
+ if(peer)
+ kptllnd_peer_decref(peer,"lookup");
+
+ /* RX == NULL if it is completing asynchronously */
+ if(rx)
+ kptllnd_rx_decref(rx,"sched",kptllnd_data);
+
+ PJK_UT_MSG_DATA("<<< RXRXRXRXRXRXRXRXRXRXRXRX rx=%p\n",rx);
+ return;
+}
+
+void
+kptllnd_rx_buffer_addref(
+ kptl_rx_buffer_t *rxb,
+ const char *owner)
+{
+ atomic_inc(&rxb->rxb_refcount);
+
+#if 0
+ /*
+ * The below message could actually be out of sync
+ * with the real ref count, and is for informational purposes
+ * only
+ */
+ PJK_UT_MSG("rxb=%p owner=%s count=%d\n",rxb,owner,
+ atomic_read(&rxb->rxb_refcount));
+#endif
+}
+
+void
+kptllnd_rx_buffer_decref(
+ kptl_rx_buffer_t *rxb,
+ const char *owner)
+{
+ if( !atomic_dec_and_test (&rxb->rxb_refcount)){
+
+#if 0
+ /*
+ * The below message could actually be out of sync
+ * with the real ref count, and is for informational purposes
+ * only
+ */
+ PJK_UT_MSG("rxb=%p owner=%s count=%d\n",rxb,owner,
+ atomic_read(&rxb->rxb_refcount));
+#endif
+ return;
+ }
+
+#if 0
+ PJK_UT_MSG("rxb=%p owner=%s LAST REF reposting\n",rxb,owner);
+#endif
+
+ kptllnd_rx_buffer_post_handle_error(rxb);
+}
+
+kptl_rx_t*
+kptllnd_rx_alloc(
+ kptl_data_t *kptllnd_data )
+{
+ kptl_rx_t* rx;
+
+ if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_RX_ALLOC )){
+ PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_RX_ALLOC SIMULATION triggered\n");
+ CERROR ("FAIL_BLOCKING_RX_ALLOC SIMULATION triggered\n");
+ STAT_UPDATE(kps_rx_allocation_failed);
+ return 0;
+ }
+
+ rx = cfs_mem_cache_alloc ( kptllnd_data->kptl_rx_cache , GFP_ATOMIC);
+ if(rx == 0 ){
+ CERROR("Failed to allocate rx\n");
+ STAT_UPDATE(kps_rx_allocation_failed);
+
+ }else{
+
+ STAT_UPDATE(kps_rx_allocated);
+
+                memset(rx, 0, sizeof(*rx));
+
+ CFS_INIT_LIST_HEAD(&rx->rx_list);
+ atomic_set(&rx->rx_refcount,1);
+ }
+
+ return rx;
+}
+
+void
+kptllnd_rx_destroy(kptl_rx_t *rx,kptl_data_t *kptllnd_data)
+{
+ kptl_peer_t *peer = rx->rx_peer;
+ kptl_msg_t *msg = rx->rx_msg;
+ int returned_credits = msg->ptlm_credits;
+
+ PJK_UT_MSG(">>> rx=%p\n",rx);
+
+ STAT_UPDATE(kps_rx_released);
+
+ LASSERT(atomic_read(&rx->rx_refcount)==0);
+
+ if(rx->rx_rxb){
+ PJK_UT_MSG("Release rxb=%p\n",rx->rx_rxb);
+ kptllnd_rx_buffer_decref(rx->rx_rxb,"rx");
+ rx->rx_rxb = 0;
+ }else{
+ PJK_UT_MSG("rxb already released\n");
+ }
+
+ if(peer){
+
+ /*
+ * Update credits
+ * (Only after I've reposted the buffer)
+ */
+ spin_lock(&peer->peer_lock);
+ peer->peer_credits += returned_credits;
+ LASSERT( peer->peer_credits <=
+ *kptllnd_tunables.kptl_peercredits);
+ peer->peer_outstanding_credits++;
+ LASSERT( peer->peer_outstanding_credits <=
+ *kptllnd_tunables.kptl_peercredits);
+ spin_unlock(&peer->peer_lock);
+
+ PJK_UT_MSG_DATA("Giving Back %d credits rx=%p\n",returned_credits,rx);
+
+ /* Have I received credits that will let me send? */
+ if (returned_credits != 0)
+ kptllnd_peer_check_sends(peer);
+
+ kptllnd_peer_decref(peer,"lookup");
+ }
+
+ cfs_mem_cache_free(kptllnd_data->kptl_rx_cache,rx);
+
+ PJK_UT_MSG("<<< rx=%p\n",rx);
+}
+
+void
+kptllnd_rx_addref(kptl_rx_t *rx,const char *owner)
+{
+ atomic_inc(&rx->rx_refcount);
+
+ /*
+ * The below message could actually be out of sync
+ * with the real ref count, and is for informational purposes
+ * only
+ */
+ PJK_UT_MSG("rx=%p owner=%s count=%d\n",rx,owner,
+ atomic_read(&rx->rx_refcount));
+}
+
+void
+kptllnd_rx_decref(kptl_rx_t *rx,const char *owner,kptl_data_t *kptllnd_data)
+{
+ if( !atomic_dec_and_test (&rx->rx_refcount)){
+ /*
+ * The below message could actually be out of sync
+ * with the real ref count, and is for informational purposes
+ * only
+ */
+ PJK_UT_MSG("rx=%p owner=%s count=%d\n",rx,owner,
+ atomic_read(&rx->rx_refcount));
+ return;
+ }
+
+ PJK_UT_MSG("rx=%p owner=%s LAST REF destroying\n",rx,owner);
+
+ kptllnd_rx_destroy(rx,kptllnd_data);
+}
+
--- /dev/null
+#include "ptllnd.h"
+
+
+void
+kptllnd_tx_schedule (kptl_tx_t *tx);
+
+
+
+int
+kptllnd_setup_tx_descs (kptl_data_t *kptllnd_data)
+{
+ kptl_tx_t *tx;
+ int i;
+
+ PJK_UT_MSG("\n");
+
+ /*
+ * First initialize the tx descriptors
+ */
+ memset(kptllnd_data->kptl_tx_descs, 0,
+ PTLLND_TX_MSGS() * sizeof(kptl_tx_t));
+
+ for (i = 0; i < PTLLND_TX_MSGS(); i++) {
+ tx = &kptllnd_data->kptl_tx_descs[i];
+
+
+ kptllnd_posted_object_setup(&tx->tx_po,
+ kptllnd_data,
+ POSTED_OBJECT_TYPE_TX);
+
+ CFS_INIT_LIST_HEAD(&tx->tx_list);
+ CFS_INIT_LIST_HEAD(&tx->tx_schedlist);
+
+ /*
+ * Determine if this is a regular or reserved descriptor
+ */
+ tx->tx_isnblk = (i >= *kptllnd_tunables.kptl_ntx);
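+                /*
+                 * Descriptors with index >= ntx form the reserved
+                 * (non-blocking) pool: they are queued on
+                 * kptl_idle_nblk_txs below and are only handed out by
+                 * kptllnd_get_idle_tx() to callers that cannot block.
+                 */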
+
+ /*
+ * Set the state
+ */
+ tx->tx_state = TX_STATE_ON_IDLE_QUEUE;
+
+                PORTAL_ALLOC(tx->tx_msg, *kptllnd_tunables.kptl_max_immd_size);
+                if(tx->tx_msg == NULL){
+                        CERROR("Failed to allocate TX payload\n");
+                        kptllnd_cleanup_tx_descs(kptllnd_data);
+                        return -ENOMEM;
+                }
+
+
+ /*
+ * Add this to the correct queue
+ */
+ if (tx->tx_isnblk)
+ list_add (&tx->tx_list,
+ &kptllnd_data->kptl_idle_nblk_txs);
+ else
+ list_add (&tx->tx_list,
+ &kptllnd_data->kptl_idle_txs);
+ }
+
+ return (0);
+}
+
+void
+kptllnd_cleanup_tx_descs(kptl_data_t *kptllnd_data)
+{
+ kptl_tx_t *tx;
+ int i;
+
+ PJK_UT_MSG("\n");
+
+ for (i = 0; i < PTLLND_TX_MSGS(); i++) {
+ tx = &kptllnd_data->kptl_tx_descs[i];
+
+
+                /*
+                 * Handle partial initialization by stopping
+                 * when we hit one that is not fully initialized
+                 */
+ if( tx->tx_msg == 0 )
+ break;
+
+ LASSERT( tx->tx_state == TX_STATE_ON_IDLE_QUEUE );
+
+ PORTAL_FREE(tx->tx_msg,*kptllnd_tunables.kptl_max_immd_size);
+ }
+}
+
+kptl_tx_t *
+kptllnd_get_idle_tx(
+ kptl_data_t *kptllnd_data,
+ int may_block,
+ kptl_tx_type_t purpose)
+{
+ kptl_tx_t *tx = NULL;
+ ENTRY;
+
+ PJK_UT_MSG(">>> may_block=%d purpose=%d\n",may_block,purpose);
+
+ if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX_PUT_ALLOC ) && purpose == TX_TYPE_LARGE_PUT){
+ PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX_PUT_ALLOC SIMULATION triggered\n");
+ CERROR ("FAIL_BLOCKING_TX_PUT_ALLOC SIMULATION triggered\n");
+ tx = NULL;
+ STAT_UPDATE(kpt_tx_allocation_failed);
+ goto exit;
+ }
+ if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX_GET_ALLOC ) && purpose == TX_TYPE_LARGE_GET){
+ PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX_GET_ALLOC SIMULATION triggered\n");
+ CERROR ("FAIL_BLOCKING_TX_GET_ALLOC SIMULATION triggered\n");
+ tx = NULL;
+ STAT_UPDATE(kpt_tx_allocation_failed);
+ goto exit;
+ }
+ if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX )){
+ PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX SIMULATION triggered\n");
+ CERROR ("FAIL_BLOCKING_TX SIMULATION triggered\n");
+ tx = NULL;
+ STAT_UPDATE(kpt_tx_allocation_failed);
+ goto exit;
+ }
+
+        spin_lock(&kptllnd_data->kptl_tx_lock);
+
+        while ( !kptllnd_data->kptl_shutdown ) {
+
+                /* "normal" descriptor is free */
+                if (!list_empty (&kptllnd_data->kptl_idle_txs)) {
+                        tx = list_entry (kptllnd_data->kptl_idle_txs.next,
+                                         kptl_tx_t, tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        /* may dip into reserve pool */
+                        if (list_empty (&kptllnd_data->kptl_idle_nblk_txs)) {
+                                CERROR ("reserved tx desc pool exhausted\n");
+                                break;
+                        }
+
+                        tx = list_entry (kptllnd_data->kptl_idle_nblk_txs.next,
+                                         kptl_tx_t, tx_list);
+                        break;
+                }
+
+                /* block for an idle tx, dropping the lock while we wait */
+                spin_unlock(&kptllnd_data->kptl_tx_lock);
+
+                wait_event (kptllnd_data->kptl_idle_tx_waitq,
+                            !list_empty (&kptllnd_data->kptl_idle_txs) ||
+                            kptllnd_data->kptl_shutdown);
+
+                spin_lock(&kptllnd_data->kptl_tx_lock);
+        }
+
+ if (tx != NULL) {
+
+ /*
+ * Check the state
+ */
+ LASSERT(tx->tx_state == TX_STATE_ON_IDLE_QUEUE);
+
+ /*
+ * Reference is now owned by caller
+ */
+ LASSERT(atomic_read(&tx->tx_refcount)== 0);
+ atomic_set(&tx->tx_refcount,1);
+
+ /*
+ * Remove it from the idle queue
+ */
+ list_del_init (&tx->tx_list);
+
+ /*
+ * Set the state and type
+ */
+ tx->tx_state = TX_STATE_ALLOCATED;
+ tx->tx_type = purpose;
+
+ /*
+ * Initialize the TX descriptor so that cleanup can be
+ * handled easily even with a partially initialized descriptor
+ */
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+ tx->tx_ptlmsg = 0;
+ tx->tx_ptlmsg_reply = 0;
+ tx->tx_peer = 0;
+ tx->tx_associated_rx = 0;
+
+ /*
+ * These must be re-initialized
+ */
+ tx->tx_status = -EINVAL;
+ tx->tx_seen_send_end = 0;
+ tx->tx_seen_reply_end = 0;
+ tx->tx_payload_niov = 0;
+ tx->tx_payload_iov = 0;
+ tx->tx_payload_kiov = 0;
+ tx->tx_payload_offset = 0;
+ tx->tx_payload_nob = 0;
+
+ STAT_UPDATE(kps_tx_allocated);
+ }else{
+ STAT_UPDATE(kpt_tx_allocation_failed);
+ }
+
+ spin_unlock(&kptllnd_data->kptl_tx_lock);
+
+exit:
+ PJK_UT_MSG("<<< tx=%p\n",tx);
+
+ RETURN(tx);
+}
+
+void
+kptllnd_tx_done (kptl_tx_t *tx)
+{
+ kptl_data_t *kptllnd_data = tx->tx_po.po_kptllnd_data;
+ LASSERT (!in_interrupt());
+
+ PJK_UT_MSG(">>> tx=%p\n",tx);
+
+ LASSERT(tx->tx_state != TX_STATE_ON_IDLE_QUEUE);
+ LASSERT(PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE));
+ LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+ LASSERT(atomic_read(&tx->tx_refcount) == 0);
+        LASSERT(list_empty(&tx->tx_schedlist)); /* not on the scheduler list */
+
+ if(tx->tx_ptlmsg != 0){
+ PJK_UT_MSG("tx=%p finalize\n",tx);
+ lnet_finalize (kptllnd_data->kptl_ni, tx->tx_ptlmsg, tx->tx_status);
+ tx->tx_ptlmsg = 0;
+ }
+ if(tx->tx_ptlmsg_reply != 0){
+ PJK_UT_MSG("tx=%p finalize reply\n",tx);
+ lnet_finalize (kptllnd_data->kptl_ni, tx->tx_ptlmsg_reply, tx->tx_status);
+ tx->tx_ptlmsg_reply = 0;
+ }
+
+ /*
+ * Release the associated RX if there is one
+ */
+ if(tx->tx_associated_rx){
+ PJK_UT_MSG("tx=%p destroy associated rx %p\n",tx,tx->tx_associated_rx);
+ kptllnd_rx_decref(tx->tx_associated_rx,"tx",kptllnd_data);
+ tx->tx_associated_rx =0;
+ }
+
+ /*
+ * Cleanup resources associate with the peer
+ */
+ if(tx->tx_peer){
+ PJK_UT_MSG("tx=%p detach from peer=%p\n",tx,tx->tx_peer);
+ kptllnd_peer_dequeue_tx(tx->tx_peer,tx);
+ kptllnd_peer_decref(tx->tx_peer,"tx");
+ tx->tx_peer = NULL;
+ }
+
+ LASSERT(list_empty(&tx->tx_list)); /* removed from any peer list*/
+
+ /*
+ * state = back on idle queue
+ */
+ tx->tx_state = TX_STATE_ON_IDLE_QUEUE;
+
+ /*
+ * Put this tx descriptor back on the correct idle queue
+ * If this is a "normal" descriptor then somebody might
+ * be waiting so wake them up
+ */
+ spin_lock(&kptllnd_data->kptl_tx_lock);
+
+ if (tx->tx_isnblk) {
+ list_add (&tx->tx_list, &kptllnd_data->kptl_idle_nblk_txs);
+ } else {
+ list_add (&tx->tx_list, &kptllnd_data->kptl_idle_txs);
+ wake_up (&kptllnd_data->kptl_idle_tx_waitq);
+ }
+
+ STAT_UPDATE(kps_tx_released);
+
+ spin_unlock(&kptllnd_data->kptl_tx_lock);
+
+ PJK_UT_MSG("<<< tx=%p\n",tx);
+}
+
+void
+kptllnd_tx_schedule (kptl_tx_t *tx)
+{
+ kptl_data_t *kptllnd_data = tx->tx_po.po_kptllnd_data;
+ unsigned long flags;
+
+ PJK_UT_MSG("tx=%p\n",tx);
+
+ spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags);
+ LASSERT(list_empty(&tx->tx_schedlist));
+ list_add_tail(&tx->tx_schedlist,&kptllnd_data->kptl_sched_txq);
+ wake_up(&kptllnd_data->kptl_sched_waitq);
+ spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags);
+}
+
+void
+kptllnd_tx_callback(ptl_event_t *ev)
+{
+ kptl_tx_t *tx = ev->md.user_ptr;
+ kptl_peer_t *peer;
+ int rc;
+ int do_decref = 0;
+
+ PJK_UT_MSG(">>> %s(%d) tx=%p fail=%d\n",
+ get_ev_type_string(ev->type),ev->type,tx,ev->ni_fail_type);
+
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+ PJK_UT_MSG("ev->unlinked=%d\n",ev->unlinked);
+#endif
+
+ if(ev->type == PTL_EVENT_UNLINK ){
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                /*
+                 * Ignore unlink events if we don't have Lustre unlink
+                 * semantics: they occur in one-to-one correspondence
+                 * with the OPXXX_END events and we've already cleaned
+                 * up in those cases.
+                 */
+ PJK_UT_MSG("<<<\n");
+ return;
+#else
+ /*
+ * Clear the handles
+ */
+ if(PtlHandleIsEqual(ev->md_handle,tx->tx_mdh))
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ else if (PtlHandleIsEqual(ev->md_handle,tx->tx_mdh_msg))
+ tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+
+ tx->tx_status = -EINVAL;
+ kptllnd_tx_scheduled_decref(tx);
+ PJK_UT_MSG("<<<\n");
+ return;
+#endif
+ }
+
+ LASSERT(tx->tx_peer != NULL);
+ peer = tx->tx_peer;
+
+ spin_lock(&peer->peer_lock);
+
+ /*
+ * Save the status flag
+ */
+ tx->tx_status = ev->ni_fail_type == PTL_NI_OK ? 0 : -EINVAL;
+
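+        /*
+         * The switches below decide which MD handle this event retires:
+         *   TX_TYPE_SMALL_MESSAGE      - tx_mdh_msg only, done at SEND_END
+         *   TX_TYPE_LARGE_PUT/GET      - tx_mdh_msg at SEND_END; tx_mdh at
+         *                                the peer's GET_END/PUT_END (or it
+         *                                is unlinked explicitly on failure)
+         *   TX_TYPE_LARGE_PUT_RESPONSE - tx_mdh only; needs both SEND_END
+         *                                and REPLY_END, in either order
+         *   TX_TYPE_LARGE_GET_RESPONSE - tx_mdh only, done at SEND_END
+         * Retiring a handle releases the tx reference that was held for it.
+         */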
+ switch(ev->type)
+ {
+ case PTL_EVENT_SEND_END:
+
+                /*
+                 * Mark that we've seen a SEND_END
+                 */
+ tx->tx_seen_send_end = 1;
+
+ switch(tx->tx_type)
+ {
+ default:
+ LBUG();
+ break;
+
+ case TX_TYPE_SMALL_MESSAGE:
+ PJK_UT_MSG("TX_TYPE_SMALL_MESSAGE\n");
+ LASSERT(PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE));
+
+ /*
+ * Success or failure we are done with the Message MD
+ */
+ tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+ do_decref = 1;
+ break;
+
+ case TX_TYPE_LARGE_PUT:
+ case TX_TYPE_LARGE_GET:
+ PJK_UT_MSG("TX_TYPE_LARGE_%s\n",
+ tx->tx_type == TX_TYPE_LARGE_PUT ?
+ "PUT" : "GET");
+ /*
+ * Success or failure we are done with the Message MD
+ */
+ tx->tx_mdh_msg = PTL_INVALID_HANDLE;
+
+                        /*
+                         * If there was an error we're not going to make
+                         * any more progress, and the PUT_END or GET_END
+                         * event is never going to come.
+                         */
+ if(ev->ni_fail_type != PTL_NI_OK ){
+
+                                /*
+                                 * There was an error sending the message,
+                                 * so we can safely unlink the bulk MD.
+                                 */
+ if(!PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)){
+ LASSERT(atomic_read(&tx->tx_refcount)>1);
+ rc = PtlMDUnlink(tx->tx_mdh);
+ LASSERT(rc == 0);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+                                        /*
+                                         * We are holding another reference,
+                                         * so this does nothing except
+                                         * decrement tx_refcount.
+                                         */
+ kptllnd_tx_decref(tx);
+#endif
+ }
+ }
+
+ do_decref = 1;
+ break;
+
+ case TX_TYPE_LARGE_PUT_RESPONSE:
+ PJK_UT_MSG("TX_TYPE_LARGE_PUT_RESPONSE\n");
+ LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+
+                        /*
+                         * If we've already seen the REPLY_END, or if this
+                         * is a failure and we're NEVER going to see it,
+                         * release our reference here.
+                         */
+ if(tx->tx_seen_reply_end || ev->ni_fail_type != PTL_NI_OK){
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ do_decref = 1;
+ }
+ break;
+
+ case TX_TYPE_LARGE_GET_RESPONSE:
+ PJK_UT_MSG("TX_TYPE_LARGE_GET_RESPONSE\n");
+ LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+
+ /*
+ * Success or failure we are done with the MD
+ */
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ do_decref = 1;
+ break;
+ }
+ break;
+
+ case PTL_EVENT_GET_END:
+ LASSERT(tx->tx_type == TX_TYPE_LARGE_PUT);
+ LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ do_decref = 1;
+ break;
+ case PTL_EVENT_PUT_END:
+ LASSERT(tx->tx_type == TX_TYPE_LARGE_GET);
+ LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ do_decref = 1;
+ break;
+ case PTL_EVENT_REPLY_END:
+ LASSERT(tx->tx_type == TX_TYPE_LARGE_PUT_RESPONSE);
+ LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE));
+ tx->tx_seen_reply_end = 1;
+ if(tx->tx_seen_send_end){
+ tx->tx_mdh = PTL_INVALID_HANDLE;
+ do_decref = 1;
+ }
+ break;
+ default:
+ LBUG();
+ }
+
+ spin_unlock(&peer->peer_lock);
+
+ if(do_decref)
+ kptllnd_tx_scheduled_decref(tx);
+ PJK_UT_MSG("<<< decref=%d\n",do_decref);
+}
+
+void
+kptllnd_tx_addref(
+ kptl_tx_t *tx)
+{
+ atomic_inc(&tx->tx_refcount);
+}
+
+void
+kptllnd_tx_decref(
+ kptl_tx_t *tx)
+{
+ if( !atomic_dec_and_test(&tx->tx_refcount)){
+ return;
+ }
+
+ PJK_UT_MSG("tx=%p LAST REF\n",tx);
+ kptllnd_tx_done(tx);
+}
+
+void
+kptllnd_tx_scheduled_decref(
+ kptl_tx_t *tx)
+{
+ if( !atomic_dec_and_test(&tx->tx_refcount)){
+ /*
+ * The below message could actually be out of sync
+ * with the real ref count, and is for informational purposes
+ * only
+ */
+ PJK_UT_MSG("tx=%p count=%d\n",tx,
+ atomic_read(&tx->tx_refcount));
+ return;
+ }
+
+ PJK_UT_MSG("tx=%p LAST REF\n",tx);
+ kptllnd_tx_schedule(tx);
+}
--- /dev/null
+/************************************************************************
+ * Portal NAL Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+typedef struct
+{
+ lnet_hdr_t kptlim_hdr; /* portals header */
+ char kptlim_payload[0]; /* piggy-backed payload */
+} WIRE_ATTR kptl_immediate_msg_t;
+
+typedef struct
+{
+ lnet_hdr_t kptlrm_hdr; /* portals header */
+ __u64 kptlrm_matchbits; /* matchbits */
+} WIRE_ATTR kptl_request_msg_t;
+
+typedef struct
+{
+ __u64 kptlhm_matchbits; /* matchbits */
+ __u32 kptlhm_max_immd_size; /* immd message size */
+} WIRE_ATTR kptl_hello_msg_t;
+
+typedef struct kptl_msg
+{
+ /* First 2 fields fixed FOR ALL TIME */
+        __u32           ptlm_magic;     /* I'm a ptl NAL message */
+ __u16 ptlm_version; /* this is my version number */
+ __u8 ptlm_type; /* the message type */
+ __u8 ptlm_credits; /* returned credits */
+ __u32 ptlm_nob; /* # bytes in whole message */
+ __u32 ptlm_cksum; /* checksum (0 == no checksum) */
+ __u64 ptlm_srcnid; /* sender's NID */
+ __u64 ptlm_srcstamp; /* sender's incarnation */
+ __u64 ptlm_dstnid; /* destination's NID */
+ __u64 ptlm_dststamp; /* destination's incarnation */
+ __u64 ptlm_seq; /* sequence number */
+
+ union {
+ kptl_immediate_msg_t immediate;
+ kptl_request_msg_t req;
+ kptl_hello_msg_t hello;
+ } WIRE_ATTR ptlm_u;
+
+}kptl_msg_t;
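+/*
+ * Layout note (illustrative): the fixed header is everything up to ptlm_u,
+ * i.e. 4+2+1+1+4+4 + 5*8 = 56 bytes, assuming WIRE_ATTR packs the struct.
+ * Since ptlm_nob counts the whole message, an immediate message carries
+ * ptlm_nob - 56 - sizeof(lnet_hdr_t) bytes of payload in kptlim_payload.
+ */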
+
+#define PTLLND_MSG_MAGIC 0x50746C4E /* 'PtlN' unique magic */
+#define PTLLND_MSG_VERSION 0x01
+
+#define PTLLND_MSG_TYPE_INVALID 0x00
+#define PLTLND_MSG_TYPE_PUT 0x01
+#define PTLLND_MSG_TYPE_GET 0x02
+#define PTLLND_MSG_TYPE_IMMEDIATE 0x03 /* No bulk data xfer*/
+#define PTLLND_MSG_TYPE_NOOP 0x04
+#define PTLLND_MSG_TYPE_HELLO 0x05
+
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <portals/api-support.h>
+
+/* This ghastly hack allows me to include lib-types.h.  It doesn't affect
+ * any assertions generated here (but fails safe if it ever does). */
+typedef struct {
+ int counter;
+} atomic_t;
+
+#include <portals/lib-types.h>
+
+#include "ptlnal_wire.h"
+
+#ifndef HAVE_STRNLEN
+#define strnlen(s, i) strlen(s)
+#endif
+
+#define BLANK_LINE() \
+do { \
+ printf ("\n"); \
+} while (0)
+
+#define COMMENT(c) \
+do { \
+ printf (" /* "c" */\n"); \
+} while (0)
+
+#undef STRINGIFY
+#define STRINGIFY(a) #a
+
+#define CHECK_DEFINE(a) \
+do { \
+ printf (" CLASSERT ("#a" == "STRINGIFY(a)");\n"); \
+} while (0)
+
+#define CHECK_VALUE(a) \
+do { \
+ printf (" CLASSERT ("#a" == %d);\n", a); \
+} while (0)
+
+#define CHECK_MEMBER_OFFSET(s,m) \
+do { \
+ CHECK_VALUE((int)offsetof(s, m)); \
+} while (0)
+
+#define CHECK_MEMBER_SIZEOF(s,m) \
+do { \
+ CHECK_VALUE((int)sizeof(((s *)0)->m)); \
+} while (0)
+
+#define CHECK_MEMBER(s,m) \
+do { \
+ CHECK_MEMBER_OFFSET(s, m); \
+ CHECK_MEMBER_SIZEOF(s, m); \
+} while (0)
+
+#define CHECK_STRUCT(s) \
+do { \
+ BLANK_LINE (); \
+ COMMENT ("Checks for struct "#s); \
+ CHECK_VALUE((int)sizeof(s)); \
+} while (0)
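+/*
+ * For example, CHECK_DEFINE(PTLLND_MSG_MAGIC) emits a line of the form
+ *
+ *     CLASSERT (PTLLND_MSG_MAGIC == 0x50746C4E);
+ *
+ * and CHECK_MEMBER(kptl_msg_t, ptlm_magic) emits one CLASSERT for the
+ * member's offset and one for its size, so the generated function fails
+ * to compile if the wire format ever drifts from the recorded values.
+ */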
+
+void
+system_string (char *cmdline, char *str, int len)
+{
+ int fds[2];
+ int rc;
+ pid_t pid;
+
+ rc = pipe (fds);
+ if (rc != 0)
+ abort ();
+
+ pid = fork ();
+ if (pid == 0) {
+ /* child */
+ int fd = fileno(stdout);
+
+ rc = dup2(fds[1], fd);
+ if (rc != fd)
+ abort();
+
+ exit(system(cmdline));
+ /* notreached */
+ } else if ((int)pid < 0) {
+ abort();
+ } else {
+ FILE *f = fdopen (fds[0], "r");
+
+ if (f == NULL)
+ abort();
+
+ close(fds[1]);
+
+ if (fgets(str, len, f) == NULL)
+ abort();
+
+ if (waitpid(pid, &rc, 0) != pid)
+ abort();
+
+ if (!WIFEXITED(rc) ||
+ WEXITSTATUS(rc) != 0)
+ abort();
+
+ if (strnlen(str, len) == len)
+ str[len - 1] = 0;
+
+ if (str[strlen(str) - 1] == '\n')
+ str[strlen(str) - 1] = 0;
+
+ fclose(f);
+ }
+}
+
+int
+main (int argc, char **argv)
+{
+ char unameinfo[80];
+ char gccinfo[80];
+
+ system_string("uname -a", unameinfo, sizeof(unameinfo));
+ system_string("gcc -v 2>&1 | tail -1", gccinfo, sizeof(gccinfo));
+
+        printf ("void ptllnd_assert_wire_constants (void)\n"
+ "{\n"
+ " /* Wire protocol assertions generated by 'wirecheck'\n"
+ " * running on %s\n"
+ " * with %s */\n"
+ "\n", unameinfo, gccinfo);
+
+ BLANK_LINE ();
+
+ COMMENT ("Constants...");
+ CHECK_DEFINE (PTLLND_MSG_MAGIC);
+ CHECK_DEFINE (PTLLND_MSG_VERSION);
+
+ CHECK_DEFINE (PTLLND_MSG_TYPE_INVALID);
+ CHECK_DEFINE (PLTLND_MSG_TYPE_PUT);
+ CHECK_DEFINE (PTLLND_MSG_TYPE_GET);
+ CHECK_DEFINE (PTLLND_MSG_TYPE_IMMEDIATE);
+ CHECK_DEFINE (PTLLND_MSG_TYPE_NOOP);
+ CHECK_DEFINE (PTLLND_MSG_TYPE_HELLO);
+
+ CHECK_STRUCT (kptl_msg_t);
+ CHECK_MEMBER (kptl_msg_t, ptlm_magic);
+ CHECK_MEMBER (kptl_msg_t, ptlm_version);
+ CHECK_MEMBER (kptl_msg_t, ptlm_type);
+ CHECK_MEMBER (kptl_msg_t, ptlm_credits);
+ CHECK_MEMBER (kptl_msg_t, ptlm_nob);
+ CHECK_MEMBER (kptl_msg_t, ptlm_cksum);
+ CHECK_MEMBER (kptl_msg_t, ptlm_srcnid);
+ CHECK_MEMBER (kptl_msg_t, ptlm_srcstamp);
+ CHECK_MEMBER (kptl_msg_t, ptlm_dstnid);
+ CHECK_MEMBER (kptl_msg_t, ptlm_dststamp);
+ CHECK_MEMBER (kptl_msg_t, ptlm_seq);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.immediate);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.req);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.hello);
+
+ CHECK_STRUCT (kptl_immediate_msg_t);
+ CHECK_MEMBER (kptl_immediate_msg_t, kptlim_hdr);
+ CHECK_MEMBER (kptl_immediate_msg_t, kptlim_payload[13]);
+
+ CHECK_STRUCT (kptl_request_msg_t);
+ CHECK_MEMBER (kptl_request_msg_t, kptlrm_hdr);
+ CHECK_MEMBER (kptl_request_msg_t, kptlrm_matchbits);
+
+ CHECK_STRUCT (kptl_hello_msg_t);
+ CHECK_MEMBER (kptl_hello_msg_t, kptlhm_matchbits);
+ CHECK_MEMBER (kptl_hello_msg_t, kptlhm_max_immd_size);
+
+ printf ("}\n\n");
+
+ return (0);
+}
.nf_modname = "kgmlnd",
.nf_addr2str = libcfs_num_addr2str,
.nf_str2addr = libcfs_num_str2addr},
+ {.nf_type = PTLLND,
+ .nf_name = "ptl",
+ .nf_modname = "kptllnd",
+ .nf_addr2str = libcfs_num_addr2str,
+ .nf_str2addr = libcfs_num_str2addr},
/* placeholder for net0 alias. It MUST BE THE LAST ENTRY */
{.nf_type = -1},
};
}
static volatile int seen = 0;
+static volatile int seen_unlink = 0;
static inline void handler(lnet_event_t *ev)
{
PJK_UT_MSG("md.user_ptr=%p\n",ev->md.user_ptr);
PJK_UT_MSG("-------- EVENT END --------------\n");
++seen;
+ if(ev->unlinked)++seen_unlink;
}
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*/
-
- #define UT_MSG_MODULE_NAME "utcli "
+
+ #define UT_MSG_MODULE_NAME "utcli "
#include "ut.h"
int pkt_size = 300;
PORTAL_ALLOC (buffer, pkt_size);
if (buffer == NULL)
{
- CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", pkt_size);
+ CERROR ("Unable to allocate out_buf (%d bytes)\n", pkt_size);
return -ENOMEM;
}
target.pid = 0;
target.nid = libcfs_str2nid(nid);
-
+
PJK_UT_MSG("target.nid="LPX64"\n",target.nid);
-
+
for(i=0;i<1;i++)
{
if(get){
while(i++ < 10 && seen == 0)
cfs_pause(cfs_time_seconds(1));
if(seen == 0)
- PJK_UT_MSG("------------------TIMEDOUT--------------------\n");
+ PJK_UT_MSG("------------------TIMEDOUT--------------------\n");
else{
int good;
if(get){
}else{
good = 1;
}
-
+
if(good)
PJK_UT_MSG("------------------COMPLETE--------------------\n");
else
- PJK_UT_MSG("------------------TIMEDOUT--------------------\n");
- }
+ PJK_UT_MSG("------------------TIMEDOUT--------------------\n");
+ }
exit5:
PJK_UT_MSG("LNetMDUnlink()\n");
LNetMDUnlink(mdh);
+
+ if(!seen_unlink){
+ PJK_UT_MSG("------------Waiting for UNLINK ------------\n");
+ i=0;
+ while(i++ < 120 && seen_unlink == 0)
+ cfs_pause(cfs_time_seconds(1));
+ }
+
+ cfs_pause(cfs_time_seconds(1));
exit4:
PJK_UT_MSG("LNetEQFree()\n");
LNetEQFree(eqh);
static void /*__exit*/ utcli_cleanup(void)
{
PJK_UT_MSG(">>>\n");
- PJK_UT_MSG("<<<\n");
+ PJK_UT_MSG("<<<\n");
} /* utcli_cleanup() */
*/
-#define UT_MSG_MODULE_NAME "utsrv "
+#define UT_MSG_MODULE_NAME "utsrv "
#include "ut.h"
lnet_process_id_t anypid;
lnet_process_id_t mypid;
lnet_md_t md;
-
+
PJK_UT_MSG(">>>\n");
- PJK_UT_MSG("pkt_size=%d\n",pkt_size);
+ PJK_UT_MSG("pkt_size=%d\n",pkt_size);
PJK_UT_MSG("auto_unlink=%d\n",auto_unlink);
-
+
PJK_UT_MSG("PORTAL_ALLOC\n");
PORTAL_ALLOC (buffer, pkt_size);
if (buffer == NULL)
{
- CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", pkt_size);
+ CERROR ("Unable to allocate out_buf (%d bytes)\n", pkt_size);
rc = -ENOMEM;
goto exit0;
- }
-
+ }
+
PJK_UT_MSG("LNetNiInit()\n");
rc = LNetNIInit(0);
if (rc < 0)
{
CERROR ("LNetNIInit: error %d\n", rc);
goto exit1;
- }
-
+ }
+
LNetGetId(0,&mypid);
PJK_UT_MSG("my.nid="LPX64"\n",mypid.nid);
- PJK_UT_MSG("my.pid=0x%x\n",mypid.pid);
-
+ PJK_UT_MSG("my.pid=0x%x\n",mypid.pid);
+
PJK_UT_MSG("LNetEQAlloc\n");
rc = LNetEQAlloc(
64, /* max number of envents why 64? */
if(rc != 0) {
CERROR("LNetEQAlloc failed %d\n",rc);
goto exit2;
- }
-
+ }
+
anypid.nid = LNET_NID_ANY;
anypid.pid = LNET_PID_ANY;
if(rc != 0) {
CERROR("LNetMeAttach failed %d\n",rc);
goto exit3;
- }
-
+ }
+
md.start = buffer;
md.length = pkt_size;
md.threshold = auto_unlink ? 1 : 100;
if(rc != 0){
CERROR("LNetMDAttach failed %d\n",rc);
goto exit4;
- }
-
+ }
+
rc = 0;
goto exit0;
-
-exit4:
+
+exit4:
PJK_UT_MSG("LNetMEUnlink()\n");
LNetMEUnlink(meh);
-exit3:
+exit3:
PJK_UT_MSG("LNetEQFree()\n");
LNetEQFree(eqh);
exit2:
PJK_UT_MSG("LNetNiFini()\n");
LNetNIFini();
exit1:
- PORTAL_FREE(buffer,pkt_size);
+ PORTAL_FREE(buffer,pkt_size);
exit0:
PJK_UT_MSG("<<< rc=%d\n",rc);
return rc;
-
+
} /* utsrv_init() */
PJK_UT_MSG("LNetMDUnlink()\n");
LNetMDUnlink(mdh);
PJK_UT_MSG("LNetMEUnlink()\n");
- LNetMEUnlink(meh);
+ LNetMEUnlink(meh);
PJK_UT_MSG("LNetEQFree()\n");
- LNetEQFree(eqh);
+ LNetEQFree(eqh);
PJK_UT_MSG("LNetNiFini()\n");
LNetNIFini();
- PORTAL_FREE(buffer,pkt_size);
+ PORTAL_FREE(buffer,pkt_size);
PJK_UT_MSG("<<<\n");
} /* utsrv_cleanup() */