From: pjkirner
Date: Wed, 21 Sep 2005 02:08:05 +0000 (+0000)
Subject: b=7982
X-Git-Tag: v1_7_100~1^25~6^2~168
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=39392995b4b2b1717476e31a6f8167d1e8d72001;p=fs%2Flustre-release.git

b=7982
* Portals LND
---

diff --git a/lnet/klnds/ptllnd/Makefile.in b/lnet/klnds/ptllnd/Makefile.in
index ae5477c..2435b58 100755
--- a/lnet/klnds/ptllnd/Makefile.in
+++ b/lnet/klnds/ptllnd/Makefile.in
@@ -1,7 +1,7 @@
 MODULES := kptllnd

-kptllnd-objs := ptllnd.o
+EXTRA_POST_CFLAGS := @PTLLNDCPPFLAGS@

-EXTRA_PRE_CFLAGS := @PTLLNDCPPFLAGS@
+kptllnd-objs := ptllnd_rx_buf.o ptllnd_tx.o ptllnd.o ptllnd_cb.o ptllnd_modparams.o ptllnd_peer.o

 @INCLUDE_RULES@
diff --git a/lnet/klnds/ptllnd/README b/lnet/klnds/ptllnd/README
new file mode 100644
index 0000000..d6cfc37
--- /dev/null
+++ b/lnet/klnds/ptllnd/README
@@ -0,0 +1,47 @@
+1. This version of the Portals LND is intended to work on the Cray XT3 using
+   Cray Portals as a network transport.
+
+2. To enable the building of the Portals LND (ptllnd.ko) configure with the
+   following option:
+       ./configure --with-portals=
+
+3. The following configuration options are supported:
+
+   ntx:
+       The total number of message descriptors
+
+   concurrent_peers:
+       The maximum number of concurrent peers. Peers attempting
+       to connect beyond the maximum will not be allowed.
+
+   peer_hash_table_size:
+       The number of hash table slots for the peers. This number
+       should scale with concurrent_peers.
+
+   cksum:
+       Set to non-zero to enable message (not RDMA) checksums for
+       outgoing packets. Incoming packets will always be checksummed
+       if necessary, independent of this value.
+
+   timeout:
+       The amount of time a request can linger in a peer's active
+       queue before the peer is considered dead. Units: seconds.
+
+   portal:
+       The portal ID to use for ptllnd traffic.
+
+   rxb_npages:
+       The number of pages in an RX buffer.
+
+   credits:
+       The maximum total number of concurrent sends that are
+       outstanding at any given instant.
+
+   peercredits:
+       The maximum number of concurrent sends that are
+       outstanding to a single peer at any given instant.
+
+   max_immd_size:
+       The maximum immediate message size. This MUST be
+       the same on all nodes in a cluster. A peer connecting
+       with a different max_immd_size will be rejected.
diff --git a/lnet/klnds/ptllnd/autoMakefile.am b/lnet/klnds/ptllnd/autoMakefile.am
index f2f6175..2ac93ab 100755
--- a/lnet/klnds/ptllnd/autoMakefile.am
+++ b/lnet/klnds/ptllnd/autoMakefile.am
@@ -10,4 +10,4 @@ endif
 endif
 MOSTLYCLEANFILES = *.o *.ko *.mod.c
-DIST_SOURCES = $(kptllnd-objs:%.o=%.c) ptllnd.h
+DIST_SOURCES = $(kptllnd-objs:%.o=%.c) ptllnd.h ptllnd_wire.h
diff --git a/lnet/klnds/ptllnd/ptllnd.c b/lnet/klnds/ptllnd/ptllnd.c
index 3688f5dd..caed511 100755
--- a/lnet/klnds/ptllnd/ptllnd.c
+++ b/lnet/klnds/ptllnd/ptllnd.c
@@ -23,46 +23,765 @@
 #include "ptllnd.h"

-lnet_handle_ni_t nih;
-
-
+
+lnd_t kptllnd_lnd = {
+        .lnd_type     = PTLLND,
+        .lnd_startup  = kptllnd_startup,
+        .lnd_shutdown = kptllnd_shutdown,
+        .lnd_ctl      = kptllnd_ctl,
+        .lnd_send     = kptllnd_send,
+        .lnd_recv     = kptllnd_recv,
+};
+
+kptl_data_t  kptllnd_data;
+kptl_stats_t kptllnd_stats;
+
+void kptllnd_shutdown (lnet_ni_t *ni);
+
+void ptllnd_assert_wire_constants (void)
+{
+        /* TBD - auto generated */
+}
+
+__u32
+kptllnd_cksum (void *ptr, int nob)
+{
+        char  *c   = ptr;
+        __u32  sum = 0;
+
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+        /* ensure I don't return 0 (== no checksum) */
+        return (sum == 0) ?
1 : sum; +} + +void +kptllnd_init_msg(kptl_msg_t *msg, int type, int body_nob) +{ + msg->ptlm_type = type; + msg->ptlm_nob = offsetof(kptl_msg_t, ptlm_u) + body_nob; +} + +void +kptllnd_msg_pack( + kptl_msg_t *msg, + int credits, + lnet_nid_t dstnid, + __u64 dststamp, + __u64 seq, + kptl_data_t *kptllnd_data) +{ + msg->ptlm_magic = PTLLND_MSG_MAGIC; + msg->ptlm_version = PTLLND_MSG_VERSION; + /* msg->ptlm_type Filled in kptllnd_init_msg() */ + msg->ptlm_credits = credits; + /* msg->ptlm_nob Filled in kptllnd_init_msg() */ + msg->ptlm_cksum = 0; + msg->ptlm_srcnid = kptllnd_data->kptl_ni->ni_nid; + msg->ptlm_srcstamp = kptllnd_data->kptl_incarnation; + msg->ptlm_dstnid = dstnid; + msg->ptlm_dststamp = dststamp; + msg->ptlm_seq = seq; + + if (*kptllnd_tunables.kptl_cksum) { + /* NB ptlm_cksum zero while computing cksum */ + msg->ptlm_cksum = kptllnd_cksum(msg, msg->ptlm_nob); + } +} + +int +kptllnd_msg_unpack(kptl_msg_t *msg, int nob,kptl_data_t *kptllnd_data) +{ + const int hdr_size = offsetof(kptl_msg_t, ptlm_u); + __u32 msg_cksum; + int flip; + int msg_nob; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Very Short message: %d\n", nob); + return -EPROTO; + } + + /* + * Determine if we need to flip + */ + if (msg->ptlm_magic == PTLLND_MSG_MAGIC) { + flip = 0; + } else if (msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC)) { + flip = 1; + } else { + CERROR("Bad magic: %08x\n", msg->ptlm_magic); + return -EPROTO; + } + + if (msg->ptlm_version != + (flip ? __swab16(PTLLND_MSG_VERSION) : PTLLND_MSG_VERSION)) { + CERROR("Bad version: got %d expected %d\n", + msg->ptlm_version,PTLLND_MSG_VERSION); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short header: got %d, wanted at least %d\n", + nob, hdr_size); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->ptlm_nob) : msg->ptlm_nob; + if (nob != msg_nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with + * 1) ptlm_cksum zero and + * 2) BEFORE anything gets modified/flipped + */ + msg_cksum = flip ? 
__swab32(msg->ptlm_cksum) : msg->ptlm_cksum; + msg->ptlm_cksum = 0; + if (msg_cksum != 0){ + STAT_UPDATE(kps_incoming_checksums_calculated); + if( msg_cksum != kptllnd_cksum(msg, msg_nob) ) { + STAT_UPDATE(kps_incoming_checksums_invalid); + CERROR("Bad checksum\n"); + return -EPROTO; + } + } + + /* Restore the checksum */ + msg->ptlm_cksum = msg_cksum; + + if(flip){ + /* leave magic unflipped as a clue to peer endianness */ + __swab16s(&msg->ptlm_version); + /* These two are 1 byte long so we don't swap them + But check this assumtion*/ + CLASSERT (sizeof(msg->ptlm_type) == 1); + CLASSERT (sizeof(msg->ptlm_credits) == 1); + msg->ptlm_nob = msg_nob; + __swab64s(&msg->ptlm_srcnid); + __swab64s(&msg->ptlm_srcstamp); + __swab64s(&msg->ptlm_dstnid); + __swab64s(&msg->ptlm_dststamp); + __swab64s(&msg->ptlm_seq); + + switch(msg->ptlm_type) + { + case PLTLND_MSG_TYPE_PUT: + case PTLLND_MSG_TYPE_GET: + __swab64s(&msg->ptlm_u.req.kptlrm_matchbits); + break; + case PTLLND_MSG_TYPE_IMMEDIATE: + case PTLLND_MSG_TYPE_NOOP: + /* Do nothing */ + break; + case PTLLND_MSG_TYPE_HELLO: + __swab64s(&msg->ptlm_u.hello.kptlhm_matchbits); + __swab32s(&msg->ptlm_u.hello.kptlhm_max_immd_size); + break; + default: + CERROR("Bad message type: %d\n", msg->ptlm_type); + return -EPROTO; + } + } + + /* + * Src nid can not be ANY + */ + if (msg->ptlm_srcnid == PTL_NID_ANY) { + CERROR("Bad src nid: "LPX64"\n", msg->ptlm_srcnid); + return -EPROTO; + } + + return 0; +} + + + +int +kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + struct portal_ioctl_data *data = arg; + int rc = -EINVAL; + kptl_data_t *kptllnd_data = ni->ni_data; + + PJK_UT_MSG(">>> kptllnd_ctl cmd=%u arg=%p\n",cmd,arg); + + /* + * Validate that the context block is actually + * pointing to this interface + */ + LASSERT (ni == kptllnd_data->kptl_ni); + + switch(cmd) { + case IOC_PORTAL_DEL_PEER: { + rc = kptllnd_peer_del (kptllnd_data,data->ioc_nid); + break; + } + /* + * Not Supported - This is Legacy stuff + case IOC_PORTAL_GET_PEER: + case IOC_PORTAL_ADD_PEER: + case IOC_PORTAL_GET_CONN: + case IOC_PORTAL_CLOSE_CONNECTION: + case IOC_PORTAL_REGISTER_MYNID: + */ + default: + CERROR("Unsupported IOCTL command %d\n",cmd); + rc=-EINVAL; + break; + } + PJK_UT_MSG("<<< kptllnd_ctl rc=%d\n",rc); + return rc; +} + +void +kptllnd_posted_object_setup( + kptl_posted_object_t* posted_obj, + kptl_data_t *kptllnd_data, + int type) +{ + /* + * Setup back pointer to LND instance data + */ + posted_obj->po_kptllnd_data = kptllnd_data; + + /* + * Setup descriptor type + */ + posted_obj->po_flags.pof_type = type; +} + +int +kptllnd_startup (lnet_ni_t *ni) +{ + int rc; + int i; + struct timeval tv; + kptl_data_t *kptllnd_data; + ptl_err_t ptl_rc; + + + PJK_UT_MSG(">>>\n"); + + LASSERT (ni->ni_lnd == &kptllnd_lnd); + + PORTAL_ALLOC (kptllnd_data,sizeof(*kptllnd_data)); + if (kptllnd_data == NULL){ + CERROR ("Failed to allocate memory for PTLLND context\n"); + return -ENOMEM; + } + + /* + * zero pointers, flags etc + * put everything into a known state. 
+ */ + memset (kptllnd_data, 0, sizeof (*kptllnd_data)); + kptllnd_data->kptl_eqh = PTL_INVALID_HANDLE; + kptllnd_data->kptl_nih = PTL_INVALID_HANDLE; + + /* + * Uptick the module reference count + */ + PORTAL_MODULE_USE; + + /* + * Setup pointers between the ni and context data block + */ + kptllnd_data->kptl_ni = ni; + ni->ni_data = kptllnd_data; + + /* + * Setup Credits + */ + ni->ni_maxtxcredits = *kptllnd_tunables.kptl_credits; + ni->ni_peertxcredits = *kptllnd_tunables.kptl_peercredits; + + + /* + * Initialize the Network interface instance + * We use the default because we don't have any + * way to choose a better interface. + * Requested and actual limits are ignored. + */ + ptl_rc = PtlNIInit(PTL_IFACE_DEFAULT, 0, NULL, NULL, + &kptllnd_data->kptl_nih); + + /* + * Note: PTL_IFACE_DUP simply means that the requested + * interface was already inited and that we're sharing it. + * Which is ok. + */ + if (ptl_rc != PTL_OK && ptl_rc != PTL_IFACE_DUP){ + CERROR ("PtlNIInit: error %d\n", ptl_rc); + rc = -EINVAL; + goto failed; + } + + ptl_rc = PtlEQAlloc( + kptllnd_data->kptl_nih, + 8, /* We use callback - no need for max */ + kptllnd_eq_callback, /* handler callback */ + &kptllnd_data->kptl_eqh); /* output handle */ + if(ptl_rc != 0) { + CERROR("PtlEQAlloc failed %d\n",ptl_rc); + rc = -ENOMEM; + goto failed; + } + + /* + * Fetch the lower NID + */ + if(ptl_rc != PtlGetId(kptllnd_data->kptl_nih,&kptllnd_data->kptl_portals_id)){ + CERROR ("PtlGetID: error %d\n", ptl_rc); + rc = -EINVAL; + goto failed; + } + + PJK_UT_MSG("lnet nid=" LPX64 " (passed in)\n",ni->ni_nid); + + /* + * Create the new NID. Based on the LND network type + * and the lower ni's address data. + */ + ni->ni_nid = ptl2lnetnid(kptllnd_data,kptllnd_data->kptl_portals_id.nid); + + PJK_UT_MSG("ptl nid=" LPX64 "\n",kptllnd_data->kptl_portals_id.nid); + PJK_UT_MSG("lnet nid=" LPX64 " (passed back)\n",ni->ni_nid); + + CDEBUG(D_INFO,"ptl nid=" LPX64 "\n",kptllnd_data->kptl_portals_id.nid); + CDEBUG(D_INFO,"lnet nid=" LPX64 "\n",ni->ni_nid); + + /* + * Initialized the incarnation + */ + do_gettimeofday(&tv); + kptllnd_data->kptl_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + PJK_UT_MSG("Incarnation=" LPX64 "\n",kptllnd_data->kptl_incarnation); + CDEBUG(D_INFO,"Incarnation=" LPX64 "\n",kptllnd_data->kptl_incarnation); + + /* + * Setup the sched locks/lists/waitq + */ + spin_lock_init (&kptllnd_data->kptl_sched_lock); + init_waitqueue_head (&kptllnd_data->kptl_sched_waitq); + INIT_LIST_HEAD (&kptllnd_data->kptl_sched_txq); + INIT_LIST_HEAD (&kptllnd_data->kptl_sched_rxq); + INIT_LIST_HEAD (&kptllnd_data->kptl_sched_rxbq); + + /* + * Setup the tx locks/lists/waitq + */ + spin_lock_init (&kptllnd_data->kptl_tx_lock); + INIT_LIST_HEAD (&kptllnd_data->kptl_idle_txs); + INIT_LIST_HEAD (&kptllnd_data->kptl_idle_nblk_txs); + init_waitqueue_head(&kptllnd_data->kptl_idle_tx_waitq); + + /* + * Allocate and setup the peer hash table + */ + PJK_UT_MSG("Allocate Peer Hash Table\n"); + rwlock_init(&kptllnd_data->kptl_peer_rw_lock); + kptllnd_data->kptl_peer_hash_size = *kptllnd_tunables.kptl_peer_hash_table_size; + INIT_LIST_HEAD(&kptllnd_data->kptl_canceled_peers); + PORTAL_ALLOC (kptllnd_data->kptl_peers, + sizeof (struct list_head) * kptllnd_data->kptl_peer_hash_size); + if (kptllnd_data->kptl_peers == NULL) { + CERROR("Failed to allocate space for peer hash table size=%d\n", + kptllnd_data->kptl_peer_hash_size); + rc = -ENOMEM; + goto failed; + } + for (i = 0; i < kptllnd_data->kptl_peer_hash_size; i++) + 
INIT_LIST_HEAD(&kptllnd_data->kptl_peers[i]); + + /* lists/ptrs/locks initialised */ + kptllnd_data->kptl_init = PTLLND_INIT_DATA; + + /*****************************************************/ + + /* + * Start the scheduler threads for handling incoming + * requests. No need to advance the state because + * this will be automatically cleaned up now that PTLNAT_INIT_DATA + * state has been entered + */ + PJK_UT_MSG("starting %d scheduler threads\n",PTLLND_N_SCHED); + for (i = 0; i < PTLLND_N_SCHED; i++) { + rc = kptllnd_thread_start ( + kptllnd_scheduler, + i, + kptllnd_data); + if (rc != 0) { + CERROR("Can't spawn scheduler[%d]: %d\n", i, rc); + goto failed; + } + } + + /* + * Allocate space for the tx descriptors + * (Note we don't need to advance the init state + * because we'll use the pointer being NULL as a sentry + * to know that we have to clean this up + */ + PJK_UT_MSG("Allocate TX Descriptor array\n"); + PORTAL_ALLOC (kptllnd_data->kptl_tx_descs, + PTLLND_TX_MSGS() * sizeof(kptl_tx_t)); + if (kptllnd_data->kptl_tx_descs == NULL){ + CERROR ("Can't allocate space for TX Descriptor array count=%d\n", + PTLLND_TX_MSGS()); + rc = -ENOMEM; + goto failed; + } + + /* + * Now setup the tx descriptors + */ + rc = kptllnd_setup_tx_descs(kptllnd_data); + if (rc != 0) { + CERROR ("Can\'t setup tx descs: %d\n", rc); + goto failed; + } + + /* flag TX descs initialised */ + kptllnd_data->kptl_init = PTLLND_INIT_TXD; + + /*****************************************************/ + + + kptllnd_rx_buffer_pool_init(&kptllnd_data->kptl_rx_buffer_pool); + + /* flag rx descs initialised */ + kptllnd_data->kptl_init = PTLLND_INIT_RXD; + + /*****************************************************/ + + + kptllnd_data->kptl_rx_cache = cfs_mem_cache_create ( + "ptllnd_rx", + sizeof(kptl_rx_t) + *kptllnd_tunables.kptl_max_immd_size, + 0, /* offset */ + 0, /* flags */ + NULL,NULL); /* CTOR/DTOR */ + if( kptllnd_data->kptl_rx_cache == 0 ){ + CERROR("Can't create slab for RX descriptrs\n"); + goto failed; + } + + rc = kptllnd_rx_buffer_pool_reserve( + &kptllnd_data->kptl_rx_buffer_pool, + kptllnd_data, + *kptllnd_tunables.kptl_concurrent_peers); + if( rc != 0) { + CERROR("Can't reserve RX Buffer pool: %d\n",rc); + goto failed; + } + + /* flag everything initialised */ + kptllnd_data->kptl_init = PTLLND_INIT_ALL; + + + /*****************************************************/ + + PJK_UT_MSG("<<< kptllnd_startup SUCCESS\n"); + return 0; + + failed: + CDEBUG(D_NET, "kptllnd_startup failed rc=%d\n",rc); + kptllnd_shutdown (ni); + PJK_UT_MSG("<<< kptllnd_startup rc=%d\n",rc); + return rc; +} + +void +kptllnd_shutdown (lnet_ni_t *ni) +{ + int i; + kptl_data_t *kptllnd_data = ni->ni_data; + + PJK_UT_MSG(">>> kptllnd_shutdown\n"); + + /* + * Validate that the context block is actually + * pointing to this interface + */ + LASSERT (ni == kptllnd_data->kptl_ni); + + CDEBUG(D_MALLOC, "before LND cleanup: kmem %d\n", + atomic_read (&libcfs_kmemory)); + + /* + * Now depending on where we are in the initialization + * cleanup the context block + */ + switch (kptllnd_data->kptl_init) { + + case PTLLND_INIT_ALL: + case PTLLND_INIT_RXD: + PJK_UT_MSG("PTLLND_INIT_RXD\n"); + + kptllnd_rx_buffer_pool_fini( + &kptllnd_data->kptl_rx_buffer_pool); + + LASSERT(list_empty(&kptllnd_data->kptl_sched_rxq)); + LASSERT(list_empty(&kptllnd_data->kptl_sched_rxbq)); + + /* fall through */ + case PTLLND_INIT_TXD: + PJK_UT_MSG("PTLLND_INIT_TXD\n"); + + /* + * If there were peers started up then + * clean them up. 
+ */ + if( atomic_read(&kptllnd_data->kptl_npeers) != 0) { + PJK_UT_MSG("Deleting %d peers\n",atomic_read(&kptllnd_data->kptl_npeers)); + + /* nuke all peers */ + kptllnd_peer_del(kptllnd_data,PTL_NID_ANY); + + i = 2; + while (atomic_read (&kptllnd_data->kptl_npeers) != 0) { + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "Waiting for %d peers to terminate\n", + atomic_read (&kptllnd_data->kptl_npeers)); + PJK_UT_MSG("Waiting for %d peers to terminate\n", + atomic_read (&kptllnd_data->kptl_npeers)); + cfs_pause(cfs_time_seconds(1)); + } + } + + LASSERT(list_empty(&kptllnd_data->kptl_canceled_peers)); + PJK_UT_MSG("All peers deleted\n"); + + /* + * Set the shutdown flag + */ + kptllnd_data->kptl_shutdown = 1; + + /* + * First thing we do is shutdown the scheduler threads + * It makes cleanup easier to not have to worry about races + * with N other threads. + * + * Also this is safe no matter the kptl_init state + * because it is a nop because kptl_nthreads==0 + * if we are not in the right state. + */ + if(atomic_read (&kptllnd_data->kptl_nthreads) != 0){ + PJK_UT_MSG("Stopping %d threads\n",atomic_read(&kptllnd_data->kptl_nthreads)); + /* + * Wake up all the schedulers + */ + wake_up_all (&kptllnd_data->kptl_sched_waitq); + + i = 2; + while (atomic_read (&kptllnd_data->kptl_nthreads) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "Waiting for %d threads to terminate\n", + atomic_read (&kptllnd_data->kptl_nthreads)); + PJK_UT_MSG("Waiting for %d threads to terminate\n", + atomic_read (&kptllnd_data->kptl_nthreads)); + cfs_pause(cfs_time_seconds(1)); + } + + } + PJK_UT_MSG("All Threads stopped\n"); + + + LASSERT(list_empty(&kptllnd_data->kptl_sched_txq)); + + kptllnd_cleanup_tx_descs(kptllnd_data); + + /* fall through */ + case PTLLND_INIT_DATA: + + PJK_UT_MSG("PTLLND_INIT_DATA\n"); + + LASSERT (atomic_read(&kptllnd_data->kptl_npeers) == 0); + LASSERT (kptllnd_data->kptl_peers != NULL); + for (i = 0; i < kptllnd_data->kptl_peer_hash_size; i++) { + LASSERT (list_empty (&kptllnd_data->kptl_peers[i])); + } + /* + * Nothing here now, but libcfs might soon require + * us to explicitly destroy wait queues and semaphores + * that would be done here + */ + + /* fall through */ + + case PTLLND_INIT_NOTHING: + PJK_UT_MSG("PTLLND_INIT_NOTHING\n"); + break; + } + + /* + * There are a number of things that can be done + * outside the state machine, because the construction + * (or lack thereof) can be determined directly from + * the pointer or handle itself. 
+ * Clean these things up here + */ + + /* + * Cleanup the portals EQ + */ + if(!PtlHandleIsEqual(kptllnd_data->kptl_eqh,PTL_INVALID_HANDLE)) + PtlEQFree(kptllnd_data->kptl_eqh); + + /* + * release the portals ni handle + */ + if(!PtlHandleIsEqual(kptllnd_data->kptl_nih,PTL_INVALID_HANDLE)) + PtlNIFini(kptllnd_data->kptl_nih); + + /* + * Free the tx descriptors + */ + if (kptllnd_data->kptl_tx_descs != NULL) + PORTAL_FREE(kptllnd_data->kptl_tx_descs, + PTLLND_TX_MSGS() * sizeof(kptl_tx_t)); + + /* + * Cleanup the RX descriptor slab + */ + if (kptllnd_data->kptl_rx_cache != NULL) + cfs_mem_cache_destroy( kptllnd_data->kptl_rx_cache); + + /* + * Cleanup the peer hash table + */ + if (kptllnd_data->kptl_peers != NULL){ + PORTAL_FREE (kptllnd_data->kptl_peers, + sizeof (struct list_head) * + kptllnd_data->kptl_peer_hash_size); + } + + /* + * And free the context block + */ + PORTAL_FREE(kptllnd_data,sizeof(*kptllnd_data)); + + CDEBUG(D_MALLOC, "after LND cleanup: kmem %d\n", + atomic_read (&libcfs_kmemory)); + + PORTAL_MODULE_UNUSE; + PJK_UT_MSG("<<<\n"); +} + +int __init +kptllnd_module_init (void) +{ + int rc; + + PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__); + + /* + * Display the module parameters + */ + CDEBUG(D_INFO,"ntx = %d\n",*kptllnd_tunables.kptl_ntx); + CDEBUG(D_INFO,"ntx_nblk = %d\n",*kptllnd_tunables.kptl_ntx_nblk); + CDEBUG(D_INFO,"concurrent_peers = %d\n",*kptllnd_tunables.kptl_concurrent_peers); + CDEBUG(D_INFO,"cksum = %d\n",*kptllnd_tunables.kptl_cksum); + CDEBUG(D_INFO,"portal = %d\n",*kptllnd_tunables.kptl_portal); + CDEBUG(D_INFO,"timeout = %d (seconds)\n",*kptllnd_tunables.kptl_timeout); + CDEBUG(D_INFO,"rxb_npages = %d\n",*kptllnd_tunables.kptl_rxb_npages); + CDEBUG(D_INFO,"credits = %d\n",*kptllnd_tunables.kptl_credits); + CDEBUG(D_INFO,"peercredits = %d\n",*kptllnd_tunables.kptl_peercredits); + CDEBUG(D_INFO,"max_immd_size = %d\n",*kptllnd_tunables.kptl_max_immd_size); + + ptllnd_assert_wire_constants(); + + rc = kptllnd_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&kptllnd_lnd); + + PJK_UT_MSG("<<<\n"); + return 0; +} + void __exit kptllnd_module_fini (void) { + PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__); - PtlNIFini(nih); + lnet_unregister_lnd(&kptllnd_lnd); + kptllnd_tunables_fini(); + kpttllnd_get_stats(); PJK_UT_MSG("<<<\n"); } -int __init -kptllnd_module_init (void) +#define DO_TYPE(x) case x: return #x; + +const char *get_ev_type_string(int type) { - int rc = 0; - lnet_process_id_t portals_id; - PJK_UT_MSG(">>> %s %s\n",__DATE__,__TIME__); - - PJK_UT_MSG("PtlNIInit\n"); - rc = PtlNIInit(PTL_IFACE_DEFAULT, 0, NULL, NULL, &nih); - if (rc != PTL_OK && rc != PTL_IFACE_DUP){ - /*CERROR ("PtlNIInit: error %d\n", rc);*/ - goto failed; + switch(type) + { + DO_TYPE(PTL_EVENT_GET_START); + DO_TYPE(PTL_EVENT_GET_END); + DO_TYPE(PTL_EVENT_PUT_START); + DO_TYPE(PTL_EVENT_PUT_END); + DO_TYPE(PTL_EVENT_REPLY_START); + DO_TYPE(PTL_EVENT_REPLY_END); + DO_TYPE(PTL_EVENT_ACK); + DO_TYPE(PTL_EVENT_SEND_START); + DO_TYPE(PTL_EVENT_SEND_END); + DO_TYPE(PTL_EVENT_UNLINK); + default: + return ""; } - - PJK_UT_MSG("PtlGetId\n"); - if(rc != PtlGetId(nih,&portals_id)){ - /*CERROR ("PtlGetID: error %d\n", rc);*/ - }else{ - PJK_UT_MSG("ptl nid=" LPX64 "\n",portals_id.nid); - } - -failed: - PJK_UT_MSG("<<<\n"); - return rc; -} - +} + +const char *get_msg_type_string(int type) +{ + switch(type) + { + DO_TYPE(PTLLND_MSG_TYPE_INVALID); + DO_TYPE(PLTLND_MSG_TYPE_PUT); + DO_TYPE(PTLLND_MSG_TYPE_GET); + DO_TYPE(PTLLND_MSG_TYPE_IMMEDIATE); + DO_TYPE(PTLLND_MSG_TYPE_HELLO); + 
default: + return ""; + } +} + +#define LOGSTAT(x) PJK_UT_MSG_ALWAYS("%30.30s %d\n",#x,kptllnd_stats.x); + +kptl_stats_t* kpttllnd_get_stats(void) +{ + LOGSTAT(kps_incoming_checksums_calculated); + LOGSTAT(kps_incoming_checksums_invalid); + LOGSTAT(kps_cleaning_caneled_peers); + LOGSTAT(kps_checking_buckets); + LOGSTAT(kps_too_many_peers); + LOGSTAT(kps_peers_created); + LOGSTAT(kps_no_credits); + LOGSTAT(kps_saving_last_credit); + LOGSTAT(kps_rx_allocated); + LOGSTAT(kps_rx_released); + LOGSTAT(kps_rx_allocation_failed); + LOGSTAT(kps_tx_allocated); + LOGSTAT(kps_tx_released); + LOGSTAT(kpt_tx_allocation_failed); + LOGSTAT(kpx_recv_delayed); + LOGSTAT(kpx_send_routing); + LOGSTAT(kpx_send_target_is_router); + + return &kptllnd_stats; +} + MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Kernel Portals LND v1.00"); -/*MODULE_LICENSE("GPL");*/ +MODULE_DESCRIPTION("Kernel Portals LND v1.00"); +MODULE_LICENSE("GPL"); module_init(kptllnd_module_init); module_exit(kptllnd_module_fini); diff --git a/lnet/klnds/ptllnd/ptllnd.h b/lnet/klnds/ptllnd/ptllnd.h index 4c302d0..2469d4c 100755 --- a/lnet/klnds/ptllnd/ptllnd.h +++ b/lnet/klnds/ptllnd/ptllnd.h @@ -47,12 +47,673 @@ #include #include #include -#include - +#include + +#include +#include + + +#define DEBUG_SUBSYSTEM S_NAL + #include #include -#include - +#include #include - -#define PJK_UT_MSG(fmt...) do{printk("<1>ptllnd: %-30s ",__FUNCTION__);printk(fmt);}while(0) + +/* + * The PTLLND was designed to support Portals with + * Lustre and non-lustre UNLINK semantics. + * However for now the two targets are Cray Portals + * on the XT3 and Lustre Portals (for testing) both + * have Lustre UNLINK semantics, so this is defined + * by default. + */ +#define LUSTRE_PORTALS_UNLINK_SEMANTICS + + +/* + * Define this to enable console debug logging + * and simulation + */ +//#define PJK_DEBUGGING + +/* + * This was used for some single node testing + * which has some hacks to allow packets that come + * back on the lookback LND to have their address + * fixed up, so that match MD's properly. And you + * can setup a connection with your self and transfer data. + * WARNING: This was for UNIT testing purposes only. + */ +//#define TESTING_WITH_LOOPBACK + + + +#define PTL_RESERVED_MATCHBITS 0x100 /* below this value is reserved + * above is for bult data transfer */ +#define LNET_MSG_MATCHBITS 0 /* the value for the message channel */ + +#if CONFIG_SMP +# define PTLLND_N_SCHED num_online_cpus() /* # schedulers */ +#else +# define PTLLND_N_SCHED 1 /* # schedulers */ +#endif + + + +/* defaults for modparams/tunables */ +#define PTLLND_NTX 32 /* # tx descs */ +#define PTLLND_NTX_NBLK 256 /* # reserved tx descs */ +#define PTLLND_NRX (64 * num_online_cpus()) /* # rx desc */ +#define PTLLND_CONCURRENT_PEERS 1152 /* # nodes all talking at once to me */ +#define PTLLND_CKSUM 0 /* checksum kptl_msg_t? 
0 = Diabled */ +#define PTLLND_TIMEOUT 50 /* default comms timeout (seconds) */ +#define PTLLND_PORTAL 9 /* The same portal PTLPRC used when talking to cray portals */ +#define PTLLND_RXB_NPAGES 1 /* Number of pages for a single RX Buffer */ +#define PTLLND_CREDITS 256 /* concurrent sends */ +#define PTLLND_PEERCREDITS 8 /* concurrent sends to 1 peer*/ +#define PTLLND_MAX_MSG_SIZE 512 /* Maximum message size */ +#define PTLLND_PEER_HASH_SIZE 101 /* # of buckets in peer hash table */ + +/* tunables fixed at compile time */ +#define PTLLND_CREDIT_HIGHWATER (*kptllnd_tunables.kptl_peercredits-1) /* when to eagerly return credits */ +#define PTLLND_TIMEOUT_SEC 3 /* How often we check a subset of the peer hash table for timeout*/ + +/************************/ +/* derived constants... */ +/* TX messages (shared by all connections) */ +#define PTLLND_TX_MSGS() (*kptllnd_tunables.kptl_ntx + \ + *kptllnd_tunables.kptl_ntx_nblk) + + +typedef struct +{ + int *kptl_ntx; /* # tx descs */ + int *kptl_ntx_nblk; /* # reserved tx descs */ + int *kptl_concurrent_peers; /* max # nodes all talking to me */ + int *kptl_cksum; /* checksum kptl_msg_t? */ + int *kptl_timeout; /* comms timeout (seconds) */ + int *kptl_portal; /* portal number */ + int *kptl_rxb_npages; /* number of pages for rx buffer */ + int *kptl_credits; /* number of credits */ + int *kptl_peercredits; /* number of credits */ + int *kptl_max_immd_size; /* max immd message size*/ + int *kptl_peer_hash_table_size; /* # slots in peer hash table */ + +#ifdef PJK_DEBUGGING + int *kptl_simulation_bitmap;/* simulation bitmap */ +#endif + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + struct ctl_table_header *kptl_sysctl; /* sysctl interface */ +#endif +} kptl_tunables_t; + + + +#include "ptllnd_wire.h" + +/***********************************************************************/ + +typedef struct kptl_data kptl_data_t; +typedef struct kptl_rx_buffer kptl_rx_buffer_t; +typedef struct kptl_peer kptl_peer_t; + +#define POSTED_OBJECT_TYPE_RESERVED 0 +#define POSTED_OBJECT_TYPE_TX 1 +#define POSTED_OBJECT_TYPE_RXB 2 + +typedef struct +{ + __u32 pof_type : 2; +}kptl_posted_object_flags_t; + +typedef struct kptl_posted_object +{ + kptl_data_t *po_kptllnd_data; /* LND Instance Data */ + kptl_posted_object_flags_t po_flags; /* flags and state */ +} kptl_posted_object_t; + +typedef struct kptl_rx /* receive message */ +{ + struct list_head rx_list; /* queue for attention */ + atomic_t rx_refcount; + kptl_rx_buffer_t *rx_rxb; /* the rx buffer pointer */ + kptl_msg_t *rx_msg; + int rx_nob; /* the number of bytes rcvd */ + ptl_process_id_t rx_initiator; /* who send the packet */ + kptl_peer_t *rx_peer; /* pointer to peer */ + size_t rx_payload[0]; /* payload QQQ*/ +} kptl_rx_t; + +typedef struct kptl_rx_buffer_pool +{ + spinlock_t rxbp_lock; + struct list_head rxbp_list; + int rxbp_count; /* the number of elements in the list */ + int rxbp_reserved; /* the number currently reserved */ + int rxbp_shutdown; /* the shutdown flag for the pool */ + int rxbp_posted; /* the number of elements posted */ +}kptl_rx_buffer_pool_t; + +typedef enum +{ + RXB_STATE_UNINITIALIZED = 0, + RXB_STATE_IDLE = 1, + RXB_STATE_POSTED = 2, +}kptl_rxb_state_t; + +struct kptl_rx_buffer +{ + /* NB - becuase this buffer is assigned to a MD's usr_ptr + * It MUST have kptl_posted_object_t as the first member + * so that the real type of the element can be determined + */ + kptl_posted_object_t rxb_po; + kptl_rx_buffer_pool_t *rxb_pool; + struct list_head rxb_list; /* for the rxb_pool list */ + 
struct list_head rxb_repost_list;/* for the kptl_sched_rxbq list*/ + kptl_rxb_state_t rxb_state; /* the state of this rx buffer*/ + atomic_t rxb_refcount; /* outstanding rx */ + ptl_handle_md_t rxb_mdh; /* the portals memory descriptor (MD) handle */ + void *rxb_buffer; /* the buffer */ + +}; + +typedef enum +{ + TX_STATE_UNINITIALIZED = 0, + TX_STATE_ON_IDLE_QUEUE = 1, + TX_STATE_ALLOCATED = 2, + TX_STATE_WAITING_CREDITS = 3, + TX_STATE_WAITING_RESPONSE = 4 +}kptl_tx_state_t; + +typedef enum +{ + TX_TYPE_RESERVED = 0, + TX_TYPE_SMALL_MESSAGE = 1, + TX_TYPE_LARGE_PUT = 2, + TX_TYPE_LARGE_GET = 3, + TX_TYPE_LARGE_PUT_RESPONSE = 4, + TX_TYPE_LARGE_GET_RESPONSE = 5, +}kptl_tx_type_t; + +typedef struct kptl_tx /* transmit message */ +{ + /* NB - becuase this buffer is assigned to a MD's usr_ptr + * It MUST have kptl_posted_object_t as the first member + * so that the real type of the element can be determined + */ + kptl_posted_object_t tx_po; + struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ + struct list_head tx_schedlist; /* queue on idle_txs ibc_tx_queue etc. */ + atomic_t tx_refcount; /* Posted Buffer refrences count*/ + kptl_tx_state_t tx_state; /* the state of this tx descriptor */ + int tx_seen_send_end; /* if we've seen a SEND_END event */ + int tx_seen_reply_end; /* if we've seen a REPLY_END event */ + kptl_tx_type_t tx_type; /* type of transfer */ + int tx_status; /* the status of this tx descriptor */ + int tx_isnblk; /* I'm reserved for non-blocking sends */ + ptl_handle_md_t tx_mdh; /* the portals memory descriptor (MD) handle */ + ptl_handle_md_t tx_mdh_msg; /* the portals MD handle for the initial message */ + lnet_msg_t *tx_ptlmsg; /* the cookie for finalize */ + lnet_msg_t *tx_ptlmsg_reply; /* the cookie for the reply message */ + kptl_msg_t *tx_msg; /* the message data */ + kptl_peer_t *tx_peer; /* the peer this is waiting on */ + unsigned long tx_deadline; /* deadline */ + kptl_rx_t *tx_associated_rx; /* Associated RX for Bulk RDMA */ + + unsigned int tx_payload_niov; + struct iovec *tx_payload_iov; + lnet_kiov_t *tx_payload_kiov; + unsigned int tx_payload_offset; + int tx_payload_nob; +} kptl_tx_t; + + +typedef enum +{ + PEER_STATE_UNINITIALIZED = 0, + PEER_STATE_WAITING_HELLO = 1, + PEER_STATE_ACTIVE = 2, + PEER_STATE_CANCELED = 3, +}kptllnd_peer_state_t; + +struct kptl_peer +{ + struct list_head peer_list; + atomic_t peer_refcount; /* The current refrences */ + kptllnd_peer_state_t peer_state; + kptl_data_t *peer_kptllnd_data; /* LND Instance Data */ + spinlock_t peer_lock; /* serialize */ + struct list_head peer_pending_txs; /* queue of pending txs */ + struct list_head peer_active_txs; /* queue of activce txs */ + int peer_active_txs_change_counter;/* updated when peer_active_txs changes*/ + lnet_nid_t peer_nid; /* who's on the other end(s) */ + __u64 peer_incarnation; /* peer's incarnation */ + __u64 peer_tx_seqnum; /* next seq# to send with*/ + int peer_credits; /* number of send credits */ + int peer_outstanding_credits;/* number of peer credits */ + __u64 peer_next_matchbits; /* Next value to use for tx desc matchbits */ + __u64 peer_last_matchbits_seen; /* last matchbits seen*/ +}; + + + +struct kptl_data +{ + int kptl_init; /* initialisation state */ + volatile int kptl_shutdown; /* shut down? 
*/ + atomic_t kptl_nthreads; /* # live threads */ + lnet_ni_t *kptl_ni; /* _the_ LND instance */ + ptl_handle_ni_t kptl_nih; /* network inteface handle */ + ptl_process_id_t kptl_portals_id; /* Portals ID of interface */ + __u64 kptl_incarnation; /* which one am I */ + ptl_handle_eq_t kptl_eqh; /* Event Queue (EQ) */ + + spinlock_t kptl_sched_lock; /* serialise the next 3 members*/ + wait_queue_head_t kptl_sched_waitq; /* schedulers sleep here */ + struct list_head kptl_sched_txq; /* tx requiring attention */ + struct list_head kptl_sched_rxq; /* rx requiring attention */ + struct list_head kptl_sched_rxbq; /* rxb requiring reposting */ + + kptl_rx_buffer_pool_t kptl_rx_buffer_pool; /* rx buffer pool */ + cfs_mem_cache_t* kptl_rx_cache; /* rx descripter cache */ + + struct kptl_tx *kptl_tx_descs; /* the tx descriptors array */ + spinlock_t kptl_tx_lock; /* serialise the next 4 members*/ + struct list_head kptl_idle_txs; /* idle tx descriptors */ + struct list_head kptl_idle_nblk_txs; /* idle reserved tx descriptors */ + wait_queue_head_t kptl_idle_tx_waitq; /* block here for tx descriptor */ + + rwlock_t kptl_peer_rw_lock; /* lock for peer table */ + struct list_head *kptl_peers; /* hash table of all my known peers */ + struct list_head kptl_canceled_peers; /* peers in the canceld state */ + int kptl_canceled_peers_counter; /* updated when canceled_peers is modified*/ + int kptl_peer_hash_size; /* size of kptl_peers */ + atomic_t kptl_npeers; /* # peers extant */ + +}; + +typedef struct kptl_stats +{ + int kps_incoming_checksums_calculated; + int kps_incoming_checksums_invalid; + int kps_cleaning_caneled_peers; /* MP Safe*/ + int kps_checking_buckets; + int kps_too_many_peers; /* MP Safe*/ + int kps_peers_created; /* MP Safe*/ + int kps_no_credits; + int kps_saving_last_credit; + int kps_rx_allocated; + int kps_rx_released; + int kps_rx_allocation_failed; + int kps_tx_allocated; /* MP Safe*/ + int kps_tx_released; /* MP Safe*/ + int kpt_tx_allocation_failed; /* MP Safe*/ + int kpx_recv_delayed; + int kpx_send_routing; + int kpx_send_target_is_router; +}kptl_stats_t; + +/* + * Note: Stats update are not atomic (for performance reasons) + * and therefore not MP safe. They are more an indiciation of + * things that are going on, as opposed to a actual count. + * + * (e.g. if kps_checking_buckets wasn't incrementing at some + * number per second, that would be an indication that the + * scheduler thread is stuck or stopped) + * + * However where possible the update of the stats are placed inside + * a spinlock to make them consistent, these are marked MP Safe above. 
+ * + */ +#define STAT_UPDATE(n) do{ ++kptllnd_stats.n; }while(0) + + +enum +{ + PTLLND_INIT_NOTHING = 0, + PTLLND_INIT_DATA = 1, + PTLLND_INIT_TXD = 2, + PTLLND_INIT_RXD = 3, + PTLLND_INIT_ALL = 4, +}; + + +extern kptl_tunables_t kptllnd_tunables; +extern kptl_stats_t kptllnd_stats; + +int kptllnd_startup ( + lnet_ni_t *ni); + +void kptllnd_shutdown ( + lnet_ni_t *ni); + +int kptllnd_ctl( + lnet_ni_t *ni, + unsigned int cmd, + void *arg); + +int kptllnd_send ( + lnet_ni_t *ni, + void *private, + lnet_msg_t *lntmsg); + +int kptllnd_recv ( + lnet_ni_t *ni, + void *private, + lnet_msg_t *lntmsg, + int delayed, + unsigned int niov, + struct iovec *iov, + lnet_kiov_t *kiov, + unsigned int offset, + unsigned int mlen, + unsigned int rlen); + +void kptllnd_eq_callback( + ptl_event_t *evp); + +int kptllnd_scheduler( + void *arg); + +int kptllnd_thread_start( + int (*fn)(void *arg), + int id, + kptl_data_t *kptllnd_data); + +int kptllnd_tunables_init(void); +void kptllnd_tunables_fini(void); + +const char *get_ev_type_string( + int evtype); + +const char *get_msg_type_string( + int type); + +kptl_stats_t* kpttllnd_get_stats(void); + +void +kptllnd_posted_object_setup( + kptl_posted_object_t* posted_obj, + kptl_data_t *kptllnd_data, + int type); + +/* + * RX BUFFER SUPPORT FUNCTIONS + */ + +void +kptllnd_rx_buffer_pool_init( + kptl_rx_buffer_pool_t *rxbp); + +void +kptllnd_rx_buffer_pool_fini( + kptl_rx_buffer_pool_t *rxbp); + +int +kptllnd_rx_buffer_pool_reserve( + kptl_rx_buffer_pool_t *rxbp, + kptl_data_t *kptllnd_data, + int count); + +void +kptllnd_rx_buffer_pool_unreserve( + kptl_rx_buffer_pool_t *rxbp, + int count); + +void +kptllnd_rx_buffer_callback( + ptl_event_t *ev); + +void +kptllnd_rx_buffer_scheduled_post( + kptl_rx_buffer_t *rxb); + +void +kptllnd_rx_buffer_post_handle_error( + kptl_rx_buffer_t *rxb); + +void +kptllnd_rx_buffer_decref( + kptl_rx_buffer_t *rxb, + const char *owner); + +/* + * RX SUPPORT FUNCTIONS + */ +void +kptllnd_rx_scheduler_handler( + kptl_rx_t *rx); + +void +kptllnd_rx_addref( + kptl_rx_t *rx, + const char *owner); + +void +kptllnd_rx_decref( + kptl_rx_t *rx, + const char *owner, + kptl_data_t *kptllnd_data); + +/* + * PEER SUPPORT FUNCTIONS + */ +void +kptllnd_peer_decref ( + kptl_peer_t *peer, + const char *owner); +void +kptllnd_peer_addref ( + kptl_peer_t *peer, + const char *owner); + +int +kptllnd_peer_del ( + kptl_data_t *kptllnd_data, + lnet_nid_t nid); + +void +kptllnd_peer_cancel( + kptl_peer_t *peer); + +void +kptllnd_peer_queue_tx ( + kptl_peer_t *peer, + kptl_tx_t *tx); + +void +kptllnd_peer_queue_bulk_rdma_tx_locked( + kptl_peer_t *peer, + kptl_tx_t *tx); + +void +kptllnd_peer_dequeue_tx( + kptl_peer_t *peer, + kptl_tx_t *tx); +void +kptllnd_peer_dequeue_tx_locked( + kptl_peer_t *peer, + kptl_tx_t *tx); + +int +kptllnd_peer_connect ( + kptl_tx_t *tx, + lnet_nid_t nid ); + +void +kptllnd_peer_check_sends ( + kptl_peer_t *peer ); +void +kptllnd_peer_check_bucket ( + int idx, + kptl_data_t *kptllnd_data); + +void +kptllnd_tx_launch ( + kptl_tx_t *tx, + lnet_nid_t target_nid, + lnet_msg_t *ptlmsg ); + +kptl_peer_t * +kptllnd_peer_find ( + kptl_data_t *kptllnd_data, + lnet_nid_t nid); + +kptl_peer_t * +kptllnd_peer_handle_hello ( + kptl_data_t *kptllnd_data, + lnet_nid_t nid, + kptl_msg_t *msg); + +static inline struct list_head * +kptllnd_nid2peerlist (kptl_data_t *kptllnd_data,lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % kptllnd_data->kptl_peer_hash_size; + + return (&kptllnd_data->kptl_peers [hash]); +} + +/* + * TX SUPPORT 
FUNCTIONS + */ +int +kptllnd_setup_tx_descs ( + kptl_data_t *kptllnd_data); + +void +kptllnd_cleanup_tx_descs( + kptl_data_t *kptllnd_data); + +void +kptllnd_tx_addref( + kptl_tx_t *tx); +void +kptllnd_tx_decref( + kptl_tx_t *tx); +void +kptllnd_tx_scheduled_decref( + kptl_tx_t *tx); +void +kptllnd_tx_done ( + kptl_tx_t *tx); +kptl_tx_t * +kptllnd_get_idle_tx( + kptl_data_t *kptllnd_data, + int may_block, + kptl_tx_type_t purpose); + +void +kptllnd_tx_callback( + ptl_event_t *ev); + +/* + * MESSAGE SUPPORT FUNCTIONS + */ +void +kptllnd_init_msg( + kptl_msg_t *msg, + int type, + int body_nob); + +void +kptllnd_msg_pack( + kptl_msg_t *msgp, + int credits, + lnet_nid_t dstnid, + __u64 dststamp, + __u64 seq, + kptl_data_t *kptllnd_data); + +int +kptllnd_msg_unpack( + kptl_msg_t *msg, + int nob, + kptl_data_t *kptllnd_data); + +/* + * MISC SUPPORT FUNCTIONS + */ + +typedef union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; +}tempiov_t; + +void +kptllnd_setup_md( + kptl_data_t *kptllnd_data, + ptl_md_t *md, + unsigned int op, + kptl_tx_t *tx, + unsigned int payload_niov, + struct iovec *payload_iov, + lnet_kiov_t *payload_kiov, + unsigned int payload_offset, + int payload_nob, + tempiov_t *tempiov); + +int kptllnd_process_scheduled_tx(kptl_data_t *kptllnd_data); +int kptllnd_process_scheduled_rx(kptl_data_t *kptllnd_data); +int kptllnd_process_scheduled_rxb(kptl_data_t *kptllnd_data); + +static inline lnet_nid_t ptl2lnetnid(kptl_data_t *kptllnd_data,ptl_nid_t portals_nid) +{ + return PTL_MKNID(PTL_NIDNET(kptllnd_data->kptl_ni->ni_nid), PTL_NIDADDR(portals_nid) ); +} + +static inline ptl_nid_t lnet2ptlnid(kptl_data_t *kptllnd_data,lnet_nid_t lnet_nid) +{ + return PTL_MKNID(PTL_NIDNET(kptllnd_data->kptl_portals_id.nid), PTL_NIDADDR(lnet_nid) ); +} + +#ifdef PJK_DEBUGGING + +#define PJK_UT_MSG_ALWAYS(fmt, a...) \ +do{ \ + printk("<1>ptllnd:%-30s:%u:",__FUNCTION__,cfs_curproc_pid()); \ + printk(fmt,## a); \ + CDEBUG(D_TRACE,fmt,## a); \ +}while(0) + +#define PJK_UT_MSG_SIMULATION(fmt, a...) PJK_UT_MSG_ALWAYS(fmt, ## a ) + + +#if 1 +#define PJK_UT_MSG_DATA(fmt, a...) do{}while(0) +#else +#define PJK_UT_MSG_DATA(fmt, a...) PJK_UT_MSG_ALWAYS(fmt, ## a ) +#endif + +#if 1 +#define PJK_UT_MSG(fmt, a...) do{}while(0) +#else +#define PJK_UT_MSG(fmt, a...) PJK_UT_MSG_ALWAYS(fmt, ## a ) +#endif + + +#define SIMULATION_FAIL_BLOCKING_TX_PUT_ALLOC 0 /* 0x00000001 */ +#define SIMULATION_FAIL_BLOCKING_TX_GET_ALLOC 1 /* 0x00000002 */ +#define SIMULATION_FAIL_BLOCKING_TX 2 /* 0x00000004 */ +#define SIMULATION_FAIL_BLOCKING_RX_ALLOC 3 /* 0x00000008 */ + +#define IS_SIMULATION_ENABLED(x) \ + (((*kptllnd_tunables.kptl_simulation_bitmap) & 1<< SIMULATION_##x) != 0) + + +#else + + +#define PJK_UT_MSG_ALWAYS(fmt, a...) do{}while(0) +#define PJK_UT_MSG_SIMULATION(fmt, a...) do{}while(0) +#define PJK_UT_MSG_DATA(fmt, a...) do{}while(0) +#define PJK_UT_MSG(fmt, a...) do{}while(0) + +#define IS_SIMULATION_ENABLED(x) 0 + +#endif + diff --git a/lnet/klnds/ptllnd/ptllnd_cb.c b/lnet/klnds/ptllnd/ptllnd_cb.c new file mode 100644 index 0000000..1978dae --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_cb.c @@ -0,0 +1,1052 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. + * Author: PJ Kirner + * + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "ptllnd.h" + +void kptllnd_clean_canceled_peers(kptl_data_t *kptllnd_data); + +void +kptllnd_setup_md( + kptl_data_t *kptllnd_data, + ptl_md_t *md, + unsigned int op, + kptl_tx_t *tx, + unsigned int payload_niov, + struct iovec *payload_iov, + lnet_kiov_t *payload_kiov, + unsigned int payload_offset, + int payload_nob, + tempiov_t *tempiov) +{ + unsigned int niov = 0; + + PJK_UT_MSG_DATA("%s nob=%d offset=%d niov=%d\n", + op == PTL_MD_OP_GET ? "GET" : "PUT", + payload_nob,payload_offset,payload_niov); + + /* One but not both of iov or kiov must be NULL (XOR) */ + LASSERT( (payload_iov != 0 && payload_kiov == 0) || + (payload_iov == 0 && payload_kiov != 0 ) ); + + /* We have a put or get operation*/ + LASSERT( op == PTL_MD_OP_GET || op == PTL_MD_OP_PUT); + + + /* Only one operation then unlink */ + md->threshold = 1; + + /* Get operations need threshold +1 to handle the + * reply operation + */ + if( op == PTL_MD_OP_GET) + md->threshold++; + + /* setup the options*/ + md->options = op; + + /* If this is a PUT then we need to disable ACK */ + /* we don't need an ACK, we'll get a callback when it is complete */ + if( op == PTL_MD_OP_PUT) + md->options |= PTL_MD_ACK_DISABLE; + + /* we don't care about the start event */ + md->options |= PTL_MD_EVENT_START_DISABLE; + + /* point back to this TX descriptor so we know what to complete + * when the event is triggered */ + md->user_ptr = tx; + + md->eq_handle = kptllnd_data->kptl_eqh; + if (payload_iov != NULL){ + + while (payload_offset >= payload_iov->iov_len) { + payload_offset -= payload_iov->iov_len; + payload_iov++; + payload_niov--; + LASSERT (payload_niov > 0); + } + + while(payload_nob){ + LASSERT( payload_offset < payload_iov->iov_len); + LASSERT (payload_niov > 0); + LASSERT (niov < sizeof(tempiov->iov)/sizeof(tempiov->iov[0])); + + tempiov->iov[niov].iov_base = payload_iov->iov_base + payload_offset; + tempiov->iov[niov].iov_len = min((int)(payload_iov->iov_len - payload_offset), + (int)payload_nob); + + payload_offset = 0; + payload_nob -= tempiov->iov[niov].iov_len; + payload_iov++; + payload_niov--; + niov++; + } + + md->start = tempiov->iov; + md->options |= PTL_MD_IOVEC; + + }else{ + + + while (payload_offset >= payload_kiov->kiov_len) { + payload_offset -= payload_kiov->kiov_len; + payload_kiov++; + payload_niov--; + LASSERT (payload_niov > 0); + } + + while(payload_nob){ + LASSERT( payload_offset < payload_kiov->kiov_len); + LASSERT (payload_niov > 0); + LASSERT (niov < sizeof(tempiov->kiov)/sizeof(tempiov->kiov[0])); + + tempiov->kiov[niov].kiov_page = payload_kiov->kiov_page; + tempiov->kiov[niov].kiov_offset = payload_kiov->kiov_offset + payload_offset; + tempiov->kiov[niov].kiov_len = min((int)(payload_kiov->kiov_len - payload_offset), + (int)payload_nob); + + payload_offset = 0; + payload_nob -= tempiov->kiov[niov].kiov_len; + payload_kiov++; + payload_niov--; + niov++; + } + + 
md->start = tempiov->kiov; + md->options |= PTL_MD_KIOV; + } + + /* + * When using PTL_MD_IOVEC or PTL_MD_KIOV this is not + * length, rather it is # iovs + */ + md->length = niov; +} + +int +kptllnd_start_bulk_rdma( + kptl_data_t *kptllnd_data, + kptl_rx_t *rx, + lnet_msg_t *lntmsg, + unsigned int op, + unsigned int payload_niov, + struct iovec *payload_iov, + lnet_kiov_t *payload_kiov, + unsigned int payload_offset, + int payload_nob) +{ + kptl_tx_t *tx; + ptl_md_t md; + ptl_err_t ptl_rc; + ptl_err_t ptl_rc2; + int rc; + tempiov_t tempiov; + kptl_msg_t *rxmsg = rx->rx_msg; + kptl_peer_t *peer = rx->rx_peer; + + + /* + * Get an idle tx descriptor + * may NOT block: (That's the "0" param) + */ + LASSERT(op == PTL_MD_OP_GET || op == PTL_MD_OP_PUT); + tx = kptllnd_get_idle_tx(kptllnd_data,0, + op == PTL_MD_OP_GET ? TX_TYPE_LARGE_PUT_RESPONSE : + TX_TYPE_LARGE_GET_RESPONSE); + if(tx == NULL){ + CERROR ("Can't start bulk rdma %d to "LPX64": tx descs exhausted\n", + op, rx->rx_initiator.nid); + return -ENOMEM; + } + + /* + * Attach the RX to the TX and take a refrence + */ + tx->tx_associated_rx = rx; + kptllnd_rx_addref(rx,"tx"); + + PJK_UT_MSG_DATA(">>> %s rx=%p associated with tx=%p\n", + op == PTL_MD_OP_GET ? "GET" : "PUT", + rx,tx); + PJK_UT_MSG_DATA("matchibts=" LPX64 "\n", + rxmsg->ptlm_u.req.kptlrm_matchbits); + + /* + * Setup the MD + */ + kptllnd_setup_md(kptllnd_data,&md,op,tx, + payload_niov,payload_iov,payload_kiov, + payload_offset,payload_nob,&tempiov); + + spin_lock(&peer->peer_lock); + + /* + * Attach the MD + */ + ptl_rc = PtlMDBind( + kptllnd_data->kptl_nih, + md, + PTL_UNLINK, + &tx->tx_mdh); + if(ptl_rc != PTL_OK){ + CERROR("PtlMDBind failed %d\n",ptl_rc); + + spin_unlock(&peer->peer_lock); + /* + * Just drop the ref for this MD because it was never + * posted to portals + */ + tx->tx_mdh = PTL_INVALID_HANDLE; + rc = -ENOMEM; + goto end; + } + + /* + * And save the portals message + */ + tx->tx_ptlmsg = lntmsg; + + /* + * Queue the request on the peer + */ + kptllnd_peer_queue_bulk_rdma_tx_locked(peer,tx); + + /* + * Grab a ref so the TX doesn't dissappear + */ + kptllnd_tx_addref(tx); + + spin_unlock(&peer->peer_lock); + + + /* + * Do the Put + */ + if( op == PTL_MD_OP_PUT) + { + ptl_rc = PtlPut ( + tx->tx_mdh, + PTL_NOACK_REQ, /* we dont need an ack */ + rx->rx_initiator, /* peer "address" */ + *kptllnd_tunables.kptl_portal, /* portal */ + 0, /* cookie */ + rxmsg->ptlm_u.req.kptlrm_matchbits, /* match bits */ + 0, /* offset - unused */ + 0); /* header data */ + }else{ + ptl_rc = PtlGet ( + tx->tx_mdh, + rx->rx_initiator, /* peer "address" */ + *kptllnd_tunables.kptl_portal, /* portal */ + 0, /* cookie */ + rxmsg->ptlm_u.req.kptlrm_matchbits, /* match bits */ + 0); /* offset - unused*/ + } + + if(ptl_rc != PTL_OK){ + CERROR("Ptl%s failed: %d\n", + op == PTL_MD_OP_GET ? "Get" : "Put",ptl_rc); + + spin_lock(&peer->peer_lock); + + /* + * Unlink the MD because it's not yet in use + * this should happen immediately + */ + LASSERT(atomic_read(&tx->tx_refcount)>1); + ptl_rc2 = PtlMDUnlink(tx->tx_mdh); + LASSERT(ptl_rc2 == PTL_OK); + +#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS + /* If we have LUSTRE Portals UNLINK semantics + * we'll get an unlink event. 
If we have standard + * Portals semantics we decref the TX explicitly here + */ + tx->tx_mdh = PTL_INVALID_HANDLE; + kptllnd_tx_decref(tx); +#endif + /* + * We are returning failure so we don't + * want tx_done to finalize the message + * so we set it to zero + */ + tx->tx_ptlmsg = 0; + + kptllnd_peer_dequeue_tx_locked(peer,tx); + tx->tx_peer = NULL; + + spin_unlock(&peer->peer_lock); + + rc = -ENOMEM; + goto end; + } + + rc = 0; + +end: + /* + * Release our temporary reference + * (this one could be the last) + */ + kptllnd_tx_decref(tx); + + PJK_UT_MSG("<<< rc=%d\n",rc); + return rc; +} + + +void +kptlnd_do_put( + kptl_tx_t *tx, + lnet_msg_t *lntmsg, + lnet_hdr_t *hdr, + kptl_data_t *kptllnd_data, + lnet_process_id_t target, + unsigned int payload_niov, + struct iovec *payload_iov, + lnet_kiov_t *payload_kiov, + unsigned int payload_offset, + unsigned int payload_nob) +{ + LASSERT(tx != NULL); + + tx->tx_payload_niov = payload_niov; + tx->tx_payload_iov = payload_iov; + tx->tx_payload_kiov = payload_kiov; + tx->tx_payload_offset = payload_offset; + tx->tx_payload_nob = payload_nob; + + tx->tx_msg->ptlm_u.req.kptlrm_hdr = *hdr; + kptllnd_init_msg (tx->tx_msg, + PLTLND_MSG_TYPE_PUT, + sizeof(kptl_request_msg_t)); + kptllnd_tx_launch(tx, target.nid,lntmsg); +} + +int +kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kptl_tx_t *tx = NULL; + kptl_data_t *kptllnd_data = ni->ni_data; + int nob; + int rc; + kptl_rx_t *rx = NULL; + + PJK_UT_MSG_DATA(">>> SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n"); + PJK_UT_MSG_DATA("nob=%d nov=%d offset=%d to %s r=%d\n", + payload_nob, payload_niov, payload_offset, + libcfs_id2str(target), + routing); + + if(routing) + STAT_UPDATE(kpx_send_routing); + if(target_is_router) + STAT_UPDATE(kpx_send_target_is_router); + + /* NB 'private' is different depending on what we're sending.... */ + + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + /* Thread context */ + LASSERT (!in_interrupt()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + /* + * we rely on this being true, as we only store hdr + * in the tx descriptor, and just ignore type + */ + LASSERT(hdr->type == type); + + switch (type) { + default: + LBUG(); + return -EINVAL; + + case LNET_MSG_PUT: + PJK_UT_MSG_DATA("LNET_MSG_PUT\n"); + + /* + * Get an idle tx descriptor + * may block: caller is app thread (That's the "1" param) + */ + tx = kptllnd_get_idle_tx(kptllnd_data,1,TX_TYPE_LARGE_PUT); + if(tx == NULL){ + CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", + type, target.nid); + return -ENOMEM; + } + + /* Is the payload small enough not to need RDMA? 
*/ + nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]); + if (nob <= *kptllnd_tunables.kptl_max_immd_size) + break; + + kptlnd_do_put(tx,lntmsg,hdr,kptllnd_data,target, + payload_niov,payload_iov, + payload_kiov,payload_offset,payload_nob); + + PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n"); + return 0; + + case LNET_MSG_GET: + + PJK_UT_MSG_DATA("LNET_MSG_GET nob=%d\n",lntmsg->msg_md->md_length); + + /* + * Get an idle tx descriptor + * may block: caller is app thread (That's the "1" param) + */ + tx = kptllnd_get_idle_tx(kptllnd_data,1,TX_TYPE_LARGE_GET); + if(tx == NULL){ + CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", + type, target.nid); + return -ENOMEM; + } + + /* + * If routing go immediate + */ + if(target_is_router || routing) + break; + + /* Is the payload small enough not to need RDMA? */ + nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[lntmsg->msg_md->md_length]); + if (nob <= *kptllnd_tunables.kptl_max_immd_size) + break; + + tx->tx_payload_offset = 0; + tx->tx_payload_niov = lntmsg->msg_md->md_niov; + tx->tx_payload_nob = lntmsg->msg_md->md_length; + + if((lntmsg->msg_md->md_options & PTL_MD_KIOV) != 0){ + tx->tx_payload_iov = 0; + tx->tx_payload_kiov = lntmsg->msg_md->md_iov.kiov; + }else{ + tx->tx_payload_iov = lntmsg->msg_md->md_iov.iov; + tx->tx_payload_kiov = 0; + } + + + tx->tx_msg->ptlm_u.req.kptlrm_hdr = *hdr; + kptllnd_init_msg (tx->tx_msg, + PTLLND_MSG_TYPE_GET, + sizeof(kptl_request_msg_t)); + + tx->tx_ptlmsg_reply = + lnet_create_reply_msg(kptllnd_data->kptl_ni,lntmsg); + + goto launch; + + case LNET_MSG_ACK: + PJK_UT_MSG_DATA("LNET_MSG_ACK\n"); + LASSERT (payload_nob == 0); + break; + + case LNET_MSG_REPLY: + PJK_UT_MSG_DATA("LNET_MSG_REPLY\n"); + + /* + * Reply's private is the incoming rx descriptor + */ + rx = private; + LASSERT(rx != NULL); + + if(lntmsg==NULL) + { + /* + * Get an idle tx descriptor + * may NOT block That's the "0" param) + */ + tx = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_LARGE_PUT); + if(tx == NULL){ + CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", + type, target.nid); + return -ENOMEM; + } + + /* Is the payload small enough not to need RDMA? */ + nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]); + if (nob <= *kptllnd_tunables.kptl_max_immd_size) + break; + + kptlnd_do_put(tx,lntmsg,hdr,kptllnd_data,target, + payload_niov,payload_iov, + payload_kiov,payload_offset,payload_nob); + + PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n"); + return 0; + }else{ + /* + * If the request was to NOT do RDMA + * break out and just send back an IMMEDIATE message + */ + if (rx->rx_msg->ptlm_type == PTLLND_MSG_TYPE_IMMEDIATE) { + /* RDMA not expected */ + nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]); + if (nob > *kptllnd_tunables.kptl_max_immd_size) { + CERROR("REPLY for "LPX64" too big but RDMA not requested:" + "%d (max for message is %d)\n", + target.nid, payload_nob, + *kptllnd_tunables.kptl_max_immd_size); + CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n", + nob, target.nid); + return -EINVAL; + } + break; + } + + + /* Incoming message consistent with RDMA? 
*/ + if (rx->rx_msg->ptlm_type != PTLLND_MSG_TYPE_GET) { + CERROR("REPLY to "LPX64" bad msg type %x!!!\n", + target.nid, rx->rx_msg->ptlm_type); + return -EINVAL; + } + + rc = kptllnd_start_bulk_rdma( + kptllnd_data, + rx, + lntmsg, + PTL_MD_OP_PUT, + payload_niov, + payload_iov, + payload_kiov, + payload_offset, + payload_nob); + PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS rc=%d\n",rc); + return rc; + } + } + + + if(tx == NULL){ + PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_IMMEDIATE\n"); + + /* + * Get an idle tx descriptor + * may NOT block: (That's the "0" param) + */ + tx = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE); + if(tx == NULL){ + CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", + type, target.nid); + return -ENOMEM; + } + }else{ + PJK_UT_MSG_DATA("Using PTLLND_MSG_TYPE_IMMEDIATE\n"); + /* + * Repurpose this TX + */ + tx->tx_type = TX_TYPE_SMALL_MESSAGE; + + } + + LASSERT (offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]) + <= *kptllnd_tunables.kptl_max_immd_size); + + /* + * Setup the header + */ + tx->tx_msg->ptlm_u.immediate.kptlim_hdr = *hdr; + + if (payload_nob > 0) { + if (payload_kiov != NULL) + lnet_copy_kiov2flat( + *kptllnd_tunables.kptl_max_immd_size, + tx->tx_msg->ptlm_u.immediate.kptlim_payload, + 0, + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat( + *kptllnd_tunables.kptl_max_immd_size, + tx->tx_msg->ptlm_u.immediate.kptlim_payload, + 0, + payload_niov, payload_iov, + payload_offset, payload_nob); + } + + nob = offsetof(kptl_immediate_msg_t, kptlim_payload[payload_nob]); + kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_IMMEDIATE,nob); + + +launch: + kptllnd_tx_launch(tx, target.nid,lntmsg); + PJK_UT_MSG_DATA("<<< SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS\n"); + return 0; +} + +int kptllnd_eager_recv(void *private,void **new_privatep) +{ + kptl_rx_t *rx = private; + + + LASSERT(rx->rx_nob < *kptllnd_tunables.kptl_max_immd_size); + + /* + * Copy the data directly into the RX + */ + memcpy(rx->rx_payload,rx->rx_msg,rx->rx_nob); + + *new_privatep = rx; + + /* + * Free the request buffer + * will repost of we are the last ones using it + */ + LASSERT(rx->rx_rxb != NULL); + kptllnd_rx_buffer_decref(rx->rx_rxb,"rx-eager"); + rx->rx_rxb = NULL; + + + /* + * Now point the msg buffer at the RX descriptor payload + * rather than the RXB (because that is now freed! 
+ */ + rx->rx_msg = (kptl_msg_t*)rx->rx_payload; + + return 0; +} + + +int kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + kptl_rx_t *rx = private; + kptl_msg_t *rxmsg = rx->rx_msg; + kptl_data_t *kptllnd_data = rx->rx_rxb->rxb_po.po_kptllnd_data; + int nob; + int rc; + + PJK_UT_MSG_DATA(">>> RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR\n"); + PJK_UT_MSG_DATA("niov=%d offset=%d mlen=%d rlen=%d\n", + niov,offset,mlen,rlen); + + LASSERT (mlen <= rlen); + LASSERT (mlen >= 0); + LASSERT (!in_interrupt()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + if(delayed) + STAT_UPDATE(kpx_recv_delayed); + + switch(rxmsg->ptlm_type) + { + default: + LBUG(); + rc = -EINVAL; + break; + + case PTLLND_MSG_TYPE_IMMEDIATE: + PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_IMMEDIATE\n"); + + nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[rlen]); + if (nob > *kptllnd_tunables.kptl_max_immd_size) { + CERROR ("Immediate message from "LPX64" too big: %d\n", + rxmsg->ptlm_u.immediate.kptlim_hdr.src_nid, rlen); + rc = -EINVAL; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov( + niov, kiov, offset, + *kptllnd_tunables.kptl_max_immd_size, + rxmsg->ptlm_u.immediate.kptlim_payload, + 0, + mlen); + else + lnet_copy_flat2iov( + niov, iov, offset, + *kptllnd_tunables.kptl_max_immd_size, + rxmsg->ptlm_u.immediate.kptlim_payload, + 0, + mlen); + + lnet_finalize (ni, lntmsg, 0); + rc = 0; + break; + + case PTLLND_MSG_TYPE_GET: + PJK_UT_MSG_DATA("PTLLND_MSG_TYPE_GET\n"); + /* We get called here just to discard any junk after the + * GET hdr. */ + LASSERT (lntmsg == NULL); /* What is this all about ???*/ + + lnet_finalize (ni, lntmsg, 0); + + rc = 0; + break; + + case PLTLND_MSG_TYPE_PUT: + PJK_UT_MSG_DATA("PLTLND_MSG_TYPE_PUT\n"); + + if (mlen == 0) { /* No payload */ + lnet_finalize(ni, lntmsg, 0); + rc = 0; + }else{ + rc = kptllnd_start_bulk_rdma( + kptllnd_data, + rx, + lntmsg, + PTL_MD_OP_GET, + niov, + iov, + kiov, + offset, + mlen); + } + break; + } + + /* + * We're done with the RX + */ + kptllnd_rx_decref(rx,"lnet_parse",kptllnd_data); + + PJK_UT_MSG_DATA("<<< RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR rc=%d\n",rc); + return rc; +} + + +void +kptllnd_eq_callback(ptl_event_t *ev) +{ + kptl_posted_object_t *po = ev->md.user_ptr; + + /* + * Just delegate to the correct callback + * based on object type + */ + if(po->po_flags.pof_type == POSTED_OBJECT_TYPE_TX) + kptllnd_tx_callback(ev); + else + kptllnd_rx_buffer_callback(ev); +} + +typedef struct +{ + int id; /* The unique ID */ + kptl_data_t *kptllnd_data; /* pointer to the NAL instance */ + +}kptllnd_thread_data_t; + +void +kptllnd_thread_fini (kptllnd_thread_data_t *thread_data) +{ + atomic_dec (&thread_data->kptllnd_data->kptl_nthreads); + PORTAL_FREE(thread_data,sizeof(*thread_data)); +} + +int +kptllnd_thread_start (int (*fn)(void *arg), int id,kptl_data_t *kptllnd_data) +{ + long pid; + kptllnd_thread_data_t *thread_data; + + /* + * Allocate the tread data so we can pass more that + * one param to the thread function + */ + PORTAL_ALLOC (thread_data,sizeof(*thread_data)); + if(thread_data == 0){ + CERROR("No memory to allocated thread data structure\n"); + return 0; + } + + atomic_inc (&kptllnd_data->kptl_nthreads); + + /* + * Initialize thread data structure + */ + thread_data->id = id; + thread_data->kptllnd_data = kptllnd_data; + + pid = kernel_thread (fn, thread_data, 0); + + /* + * 
On error cleanup the context explicitly + */ + if (pid < 0){ + CERROR("Failed to start kernel_thread id=%d\n",id); + kptllnd_thread_fini(thread_data); + return (int)pid; + }else{ + return 0; + } +} + + + +int +kptllnd_scheduler(void *arg) +{ + kptllnd_thread_data_t *thread_data = arg; + int id = thread_data->id; + kptl_data_t *kptllnd_data = thread_data->kptllnd_data; + char name[16]; + cfs_waitlink_t waitlink; + int bucket =0; + int buckets_to_check; + cfs_time_t last_check = cfs_time_current(); + cfs_duration_t duration; + time_t duration_sec; + + PJK_UT_MSG(">>>\n"); + + /* + * Daemonize + */ + snprintf(name, sizeof(name), "kptllnd_sd_%02d", id); + libcfs_daemonize(name); + + cfs_waitlink_init(&waitlink); + + /* + * Keep going around + */ + while(!kptllnd_data->kptl_shutdown) { + + /* + * Wait on the scheduler waitq + */ + + set_current_state (TASK_INTERRUPTIBLE); + cfs_waitq_add(&kptllnd_data->kptl_sched_waitq, &waitlink); + cfs_waitq_timedwait(&waitlink,cfs_time_seconds(PTLLND_TIMEOUT_SEC)); + set_current_state (TASK_RUNNING); + cfs_waitq_del (&kptllnd_data->kptl_sched_waitq, &waitlink); + + + duration = cfs_time_sub(cfs_time_current(),last_check); + duration_sec = cfs_duration_sec(duration); + + /* + * Check all the buckets over the kptl_timeout inteval + * but just determine what percenations we are supposed to be + * checking now. + * Example + * (duration/HZ) = 5 sec + * HASH_SHZE = 100 + * kptl_timeout = 60 sec. + * Result = 8 buckets to be checked (actually 8.3) + */ + buckets_to_check = duration_sec * kptllnd_data->kptl_peer_hash_size / + (*kptllnd_tunables.kptl_timeout); + + if(buckets_to_check){ + /*PJK_UT_MSG("Check Buckets %d\n",buckets_to_check);*/ + STAT_UPDATE(kps_checking_buckets); + + /* + * Because we round down the buckets we need to store + * the left over portion (.3 in the above example) + * somewhere so we don't + * lose it. Do this but updating the last check now + * to "now" but rather to some time less than "now" that + * takes into account the routing error. + */ + last_check = cfs_time_add( last_check, + cfs_time_seconds(buckets_to_check * + *kptllnd_tunables.kptl_timeout / + kptllnd_data->kptl_peer_hash_size)); + + /* + * If we are supposed to check buckets then + * do that here. + */ + while(buckets_to_check){ + kptllnd_peer_check_bucket(bucket,kptllnd_data); + bucket = (bucket+1) % kptllnd_data->kptl_peer_hash_size; + --buckets_to_check; + } + } + + /* + * Drain the RX queue + */ + while(kptllnd_process_scheduled_rx(kptllnd_data)!=0); + + /* + * Repost all RXBs + */ + while(kptllnd_process_scheduled_rxb(kptllnd_data)!=0); + + /* + * Drain the TX queue. Note RX's can cause new TX's + * to be added to the queue. 
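+ * Draining TXs after the RXs and RXBs above means any TXs
+ * scheduled by those handlers are picked up in this same pass
+ * rather than waiting for the next wakeup.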
+ */ + while(kptllnd_process_scheduled_tx(kptllnd_data)!=0); + + + /* + * Clean any canceled peers + */ + kptllnd_clean_canceled_peers(kptllnd_data); + } + + kptllnd_thread_fini(thread_data); + PJK_UT_MSG("<<<\n"); + return (0); +} + + +int kptllnd_process_scheduled_tx(kptl_data_t *kptllnd_data) +{ + kptl_tx_t *tx = 0; + unsigned long flags; + + spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags); + + /* + * If the list is not empty, grab the first one + * and pull it off the list + */ + if(!list_empty(&kptllnd_data->kptl_sched_txq)){ + tx = list_entry (kptllnd_data->kptl_sched_txq.next, + kptl_tx_t, tx_schedlist); + list_del_init(&tx->tx_schedlist); + } + + spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags); + + if(tx){ + PJK_UT_MSG(">>> tx=%p\n",tx); + kptllnd_tx_done(tx); + PJK_UT_MSG("<<<\n"); + } + + return tx!=NULL; +} + +int kptllnd_process_scheduled_rx(kptl_data_t *kptllnd_data) +{ + kptl_rx_t *rx = 0; + unsigned long flags; + + spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags); + + /* + * If the list is not empty, grab the first one + * and pull it off the list + */ + if(!list_empty(&kptllnd_data->kptl_sched_rxq)){ + rx = list_entry (kptllnd_data->kptl_sched_rxq.next, + kptl_rx_t, rx_list); + list_del_init(&rx->rx_list); + } + + spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags); + + if(rx) + kptllnd_rx_scheduler_handler(rx); + + return rx!=NULL; +} + +int kptllnd_process_scheduled_rxb(kptl_data_t *kptllnd_data) +{ + kptl_rx_buffer_t *rxb = 0; + unsigned long flags; + + spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags); + + /* + * If the list is not empty, grab the first one + * and pull it off the list + */ + if(!list_empty(&kptllnd_data->kptl_sched_rxbq)){ + rxb = list_entry (kptllnd_data->kptl_sched_rxbq.next, + kptl_rx_buffer_t, rxb_repost_list); + list_del_init(&rxb->rxb_repost_list); + } + + spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags); + + if(rxb) + kptllnd_rx_buffer_post_handle_error(rxb); + + return rxb!=NULL; +} + +void kptllnd_clean_canceled_peers(kptl_data_t *kptllnd_data) +{ + unsigned long flags; + kptl_peer_t *peer; + struct list_head *iter; + int counter; + + read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags); + + + if(!list_empty(&kptllnd_data->kptl_canceled_peers)){ + PJK_UT_MSG("Cleaning Canceled Peers\n"); + STAT_UPDATE(kps_cleaning_caneled_peers); + } + +again: + counter = kptllnd_data->kptl_canceled_peers_counter; + + list_for_each(iter, &kptllnd_data->kptl_canceled_peers) { + peer = list_entry (iter, kptl_peer_t, peer_list); + + + /* + * Take reference so we can manipulate it + * outside the lock + * */ + kptllnd_peer_addref(peer,"temp"); + + read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags); + + kptllnd_peer_cancel(peer); + kptllnd_peer_decref(peer,"temp"); + + read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags); + + /* + * if the list has changed then we need to start again + */ + if(counter != kptllnd_data->kptl_canceled_peers_counter) + goto again; + } + + read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags); +} diff --git a/lnet/klnds/ptllnd/ptllnd_modparams.c b/lnet/klnds/ptllnd/ptllnd_modparams.c new file mode 100644 index 0000000..f9ff67e --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_modparams.c @@ -0,0 +1,168 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. + * Author: PJ Kirner + * + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + + +#include "ptllnd.h" + +static int ntx = PTLLND_NTX; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of 'normal' message descriptors"); + +static int ntx_nblk = PTLLND_NTX_NBLK; +CFS_MODULE_PARM(ntx_nblk, "i", int, 0444, + "# of 'reserved' message descriptors"); + +static int concurrent_peers = PTLLND_CONCURRENT_PEERS; +CFS_MODULE_PARM(concurrent_peers, "i", int, 0444, + "maximum number of peers that may connect"); + +static int cksum = PTLLND_CKSUM; +CFS_MODULE_PARM(cksum, "i", int, 0644, + "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = PTLLND_TIMEOUT; +CFS_MODULE_PARM(timeout, "i", int, 0644, + "timeout (seconds)"); + +static int portal = PTLLND_PORTAL; +CFS_MODULE_PARM(portal, "i", int, 0444, + "portal id"); + +static int rxb_npages = PTLLND_RXB_NPAGES; +CFS_MODULE_PARM(rxb_npages, "i", int, 0444, + "# of pages for rx buffers"); + +static int credits = PTLLND_CREDITS; +CFS_MODULE_PARM(credits, "i", int, 0444, + "concurrent sends"); + +static int peercredits = PTLLND_PEERCREDITS; +CFS_MODULE_PARM(peercredits, "i", int, 0444, + "concurrent sends to 1 peer"); + +static int max_immd_size = PTLLND_MAX_MSG_SIZE; +CFS_MODULE_PARM(max_immd_size, "i", int, 0444, + "max size of immediate message"); + +static int peer_hash_table_size = PTLLND_PEER_HASH_SIZE; +CFS_MODULE_PARM(peer_hash_table_size, "i", int, 0444, + "# of slots in the peer hash table"); + +#ifdef PJK_DEBUGGING +static int simulation_bitmap = 0; +CFS_MODULE_PARM(simulation_bitmap, "i", int, 0444, + "simulation bitmap"); +#endif + + +kptl_tunables_t kptllnd_tunables = { + .kptl_ntx = &ntx, + .kptl_ntx_nblk = &ntx_nblk, + .kptl_concurrent_peers = &concurrent_peers, + .kptl_cksum = &cksum, + .kptl_portal = &portal, + .kptl_timeout = &timeout, + .kptl_rxb_npages = &rxb_npages, + .kptl_credits = &credits, + .kptl_peercredits = &peercredits, + .kptl_max_immd_size = &max_immd_size, + .kptl_peer_hash_table_size = &peer_hash_table_size, +#ifdef PJK_DEBUGGING + .kptl_simulation_bitmap = &simulation_bitmap, +#endif +}; + + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + +static ctl_table kptllnd_ctl_table[] = { + {1, "ntx", &ntx, + sizeof(int), 0444, NULL, &proc_dointvec}, + {2, "ntx_nblk", &ntx_nblk, + sizeof(int), 0444, NULL, &proc_dointvec}, + {3, "concurrent_peers", &concurrent_peers, + sizeof(int), 0444, NULL, &proc_dointvec}, + {4, "cksum", &cksum, + sizeof(int), 0644, NULL, &proc_dointvec}, + {5, "timeout", &timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {6, "portal", &portal, + sizeof(int), 0444, NULL, &proc_dointvec}, + {7, "rxb_npages", &rxb_npages, + sizeof(int), 0444, NULL, &proc_dointvec}, + {8, "credits", &kptl_credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {9, "peercredits", &kptl_peercredits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {10, "max_immd_size", &kptl_max_immd_size, + sizeof(int), 0444, NULL, &proc_dointvec}, + {11, "peer_hash_table_size,", 
&kptl_peer_hash_table_size, + sizeof(int), 0444, NULL, &proc_dointvec}, + +#ifdef PJK_DEBUGGING + {12, "simulation_bitmap,", &kptl_simulation_bitmap, + sizeof(int), 0444, NULL, &proc_dointvec}, +#endif + + {0} +}; + +static ctl_table kptllnd_top_ctl_table[] = { + {203, "ptllnd", NULL, 0, 0555, kptllnd_ctl_table}, + {0} +}; + +int +kptllnd_tunables_init () +{ + kptllnd_tunables.kptl_sysctl = + register_sysctl_table(kptllnd_top_ctl_table, 0); + + if (kptllnd_tunables.kptl_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} + +void +kptllnd_tunables_fini () +{ + if (kptllnd_tunables.kptl_sysctl != NULL) + unregister_sysctl_table(kptllnd_tunables.kptl_sysctl); +} + +#else + +int +kptllnd_tunables_init () +{ + return 0; +} + +void +kptllnd_tunables_fini () +{ +} + +#endif + diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c new file mode 100644 index 0000000..bbe1979 --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_peer.c @@ -0,0 +1,1315 @@ +#include "ptllnd.h" +#include + +void +kptllnd_peer_destroy ( + kptl_peer_t *peer); + +kptl_peer_t * +kptllnd_peer_find_locked ( + kptl_data_t *kptllnd_data, + lnet_nid_t nid); + + + +int +kptllnd_peer_create_locked ( + kptl_data_t *kptllnd_data, + kptl_peer_t **peerp, + lnet_nid_t nid) +{ + kptl_peer_t *peer; + int rc; + + PJK_UT_MSG(">>> nid="LPX64"\n",nid); + + LASSERT (nid != PTL_NID_ANY); + + /* + * But first check we haven't exceeded or maximum + * number of peers + */ + if (atomic_read(&kptllnd_data->kptl_npeers) >= + *kptllnd_tunables.kptl_concurrent_peers) { + STAT_UPDATE(kps_too_many_peers); + CERROR("Can't create peer: too many peers\n"); + rc = -EOVERFLOW; /* !! but at least it distinguishes */ + } + + PORTAL_ALLOC(peer, sizeof (*peer)); + if (peer == NULL) { + CERROR("Cannot allocate memory for peer\n"); + return -ENOMEM; + } + + memset(peer, 0, sizeof(*peer)); /* zero flags etc */ + + INIT_LIST_HEAD (&peer->peer_list); /* not in the peer table yet */ + INIT_LIST_HEAD (&peer->peer_pending_txs); + INIT_LIST_HEAD (&peer->peer_active_txs); + spin_lock_init (&peer->peer_lock); + + + peer->peer_state = PEER_STATE_WAITING_HELLO; + peer->peer_kptllnd_data = kptllnd_data; + peer->peer_nid = nid; + //peer->peer_incarnation = 0; + //peer->peer_tx_seqnum = 0; + + /* + * Just enough to send the connect message + */ + peer->peer_credits = 1; + + /* + * We just posted this many buffers ready for the peer + * to send into, so give back this many credits + */ + peer->peer_outstanding_credits = *kptllnd_tunables.kptl_peercredits - 1; + + + peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS; + //peer->peer_last_matchbits_seen = 0; + + + /* + * Reserve space in the RX buffer pool for this new peer + */ + rc = kptllnd_rx_buffer_pool_reserve( + &kptllnd_data->kptl_rx_buffer_pool, + kptllnd_data, + *kptllnd_tunables.kptl_peercredits); + if(rc != 0){ + CERROR("Cannot reserve rx buffer pool space\n"); + PORTAL_FREE(peer, sizeof (*peer)); + return rc; + } + + /* + * 1 ref for the list + * 1 for the caller + */ + atomic_set (&peer->peer_refcount, 2); + + /* npeers only grows with the global lock held */ + atomic_inc(&kptllnd_data->kptl_npeers); + + /* And add this to the list */ + list_add_tail (&peer->peer_list, + kptllnd_nid2peerlist (kptllnd_data,nid)); + + STAT_UPDATE(kps_peers_created); + + PJK_UT_MSG("<<< Peer=%p nid="LPX64"\n",peer,nid); + *peerp = peer; + return 0; +} + + +void +kptllnd_peer_destroy ( + kptl_peer_t *peer) +{ + kptl_data_t *kptllnd_data = peer->peer_kptllnd_data; + + PJK_UT_MSG("Peer=%p\n",peer); + + 
LASSERT (atomic_read (&peer->peer_refcount) == 0); + /* Not on the peer list */ + LASSERT (list_empty (&peer->peer_list)); + /* No pending tx descriptors */ + LASSERT (list_empty (&peer->peer_pending_txs)); + /* No active tx descriptors */ + LASSERT (list_empty (&peer->peer_active_txs)); + + PORTAL_FREE (peer, sizeof (*peer)); + + kptllnd_rx_buffer_pool_unreserve( + &kptllnd_data->kptl_rx_buffer_pool, + *kptllnd_tunables.kptl_peercredits); + + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. */ + atomic_dec(&kptllnd_data->kptl_npeers); +} + + +void +kptllnd_peer_addref ( + kptl_peer_t *peer, + const char *owner) +{ + atomic_inc(&peer->peer_refcount); + + /* + * The below message could actually be out of sync + * with the real ref count, and is for informational purposes + * only + */ + PJK_UT_MSG("peer=%p owner=%s count=%d\n",peer,owner, + atomic_read(&peer->peer_refcount)); +} + +void +kptllnd_peer_decref ( + kptl_peer_t *peer, + const char *owner) +{ + unsigned long flags; + kptl_data_t *kptllnd_data = peer->peer_kptllnd_data; + + if( !atomic_dec_and_test(&peer->peer_refcount)){ + + /* + * The below message could actually be out of sync + * with the real ref count, and is for informational purposes + * only + */ + PJK_UT_MSG("peer=%p owner=%s count=%d\n",peer,owner, + atomic_read(&peer->peer_refcount)); + return; + } + + PJK_UT_MSG("peer=%p owner=%s LAST REF\n",peer,owner); + + write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags); + list_del_init (&peer->peer_list); + if(peer->peer_state == PEER_STATE_CANCELED) + kptllnd_data->kptl_canceled_peers_counter++; + write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags); + + kptllnd_peer_destroy(peer); +} + + +void +kptllnd_peer_cancel_pending_txs( + kptl_peer_t *peer) +{ + struct list_head list; + struct list_head *tx_temp; + struct list_head *tx_next; + kptl_tx_t *tx; + + + INIT_LIST_HEAD (&list); + + /* + * Tranfer all the PENDING TX's to a temporary list + * while holding the peer lock + */ + spin_lock(&peer->peer_lock); + + if(!list_empty(&peer->peer_pending_txs)) + PJK_UT_MSG("Clearing Pending TXs\n"); + + list_for_each_safe (tx_temp, tx_next, &peer->peer_pending_txs) { + tx = list_entry (tx_temp, kptl_tx_t, tx_list); + + list_del_init(&tx->tx_list); + list_add(&tx->tx_list,&list); + } + + spin_unlock(&peer->peer_lock); + + /* + * Now relese the refereces outside of the peer_lock + */ + list_for_each_safe (tx_temp, tx_next, &list) { + tx = list_entry (tx_temp, kptl_tx_t, tx_list); + list_del_init(&tx->tx_list); + kptllnd_tx_decref(tx); + } +} + +void +kptllnd_peer_cancel_active_txs( + kptl_peer_t *peer) +{ + struct list_head *iter; + kptl_tx_t *tx; + ptl_err_t ptl_rc; + int counter; + + spin_lock(&peer->peer_lock); + + if(!list_empty(&peer->peer_active_txs)) + PJK_UT_MSG("Clearing Active TXs\n"); + +again: + + counter = peer->peer_active_txs_change_counter; + + list_for_each (iter, &peer->peer_active_txs) { + tx = list_entry (iter, kptl_tx_t, tx_list); + + /* + * Hold onto one ref so we can make these + * unlink calls even though we have + * released the lock + */ + kptllnd_tx_addref(tx); + + spin_unlock(&peer->peer_lock); + + + /* + * Question: Why is it safe to acces tx_mdh and tx_mdh + * outside the peer_lock. We could be racing with + * tx_callback? 
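+ * (The tx ref taken just above at least keeps the tx descriptor
+ * itself alive while the handles are read; whether the handles can
+ * change under us remains the open question.)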
+ */ + + if(!PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE)){ + PJK_UT_MSG("Unlink mhd_msg\n"); + LASSERT(atomic_read(&tx->tx_refcount)>1); + ptl_rc = PtlMDUnlink(tx->tx_mdh_msg); +#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS + if(ptl_rc == PTL_OK) { + tx->tx_mdh_msg = PTL_INVALID_HANDLE; + kptllnd_tx_decref(tx); + } +#endif + } + + if(!PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)){ + PJK_UT_MSG("Unlink mdh\n"); + LASSERT(atomic_read(&tx->tx_refcount)>1); + ptl_rc = PtlMDUnlink(tx->tx_mdh); +#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS + if(ptl_rc == PTL_OK){ + tx->tx_mdh = PTL_INVALID_HANDLE; + kptllnd_tx_decref(tx); + } +#endif + } + + kptllnd_tx_decref(tx); + + spin_lock(&peer->peer_lock); + + /* + * If a change in the list has be detected + * go back to the beginning + */ + if( counter != peer->peer_active_txs_change_counter) + goto again; + } + + spin_unlock(&peer->peer_lock); +} + +void +kptllnd_peer_cancel( + kptl_peer_t *peer) +{ + kptl_data_t *kptllnd_data = peer->peer_kptllnd_data; + unsigned long flags; + int list_owns_ref=0; + + PJK_UT_MSG(">>> Peer=%p\n",peer); + + write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags); + if(peer->peer_state != PEER_STATE_CANCELED){ + peer->peer_state = PEER_STATE_CANCELED; + list_del_init(&peer->peer_list); + list_add(&peer->peer_list,&kptllnd_data->kptl_canceled_peers); + kptllnd_data->kptl_canceled_peers_counter++; + list_owns_ref = 1; + } + write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags); + + + /* + * First cancel the pending and active TXs + */ + kptllnd_peer_cancel_pending_txs(peer); + kptllnd_peer_cancel_active_txs(peer); + + + /* lose peerlist's ref as long as we haven't done + this before */ + if(list_owns_ref) + kptllnd_peer_decref(peer,"list"); + + PJK_UT_MSG("<<< Peer=%p\n",peer); +} + +int +kptllnd_peer_del ( + kptl_data_t *kptllnd_data, + lnet_nid_t nid) +{ + struct list_head *ptmp; + struct list_head *pnxt; + kptl_peer_t *peer; + int lo; + int hi; + int i; + unsigned long flags; + int rc = -ENOENT; + + + PJK_UT_MSG(">>> NID="LPX64"\n",nid); + + /* + * Find the single bucket we are supposed to look at + * or if nid = PTL_NID_ANY then look at all of the buckets + */ + if (nid != PTL_NID_ANY) + lo = hi = kptllnd_nid2peerlist(kptllnd_data,nid) - kptllnd_data->kptl_peers; + else { + lo = 0; + hi = kptllnd_data->kptl_peer_hash_size - 1; + } + +again: + read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags); + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kptllnd_data->kptl_peers[i]) { + peer = list_entry (ptmp, kptl_peer_t, peer_list); + + /* + * Is this the right one? + */ + if (!(nid == PTL_NID_ANY || peer->peer_nid == nid)) + continue; + + kptllnd_peer_addref(peer,"temp"); /* 1 ref for me... 
*/ + + read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, + flags); + + kptllnd_peer_cancel(peer); + kptllnd_peer_decref(peer,"temp"); /* ...until here */ + + rc = 0; /* matched something */ + + /* start again now I've dropped the lock */ + goto again; + } + } + + read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags); + + PJK_UT_MSG("<<< rc=%d\n",rc); + return (rc); +} + +void +kptllnd_peer_queue_tx_locked ( + kptl_peer_t *peer, + kptl_tx_t *tx) +{ + PJK_UT_MSG("Peer=%p TX=%p\n",peer,tx); + + LASSERT(peer->peer_state != PEER_STATE_CANCELED); + LASSERT(tx->tx_state == TX_STATE_ALLOCATED); + tx->tx_state = TX_STATE_WAITING_CREDITS; + LASSERT(tx->tx_peer == NULL); + + kptllnd_peer_addref(peer,"tx"); + tx->tx_peer = peer; + + tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ); + list_add_tail(&tx->tx_list, &peer->peer_pending_txs); +} + +void +kptllnd_peer_queue_tx ( + kptl_peer_t *peer, + kptl_tx_t *tx) +{ + spin_lock(&peer->peer_lock); + kptllnd_peer_queue_tx_locked (peer, tx); + spin_unlock(&peer->peer_lock); + + kptllnd_peer_check_sends(peer); +} + + +void +kptllnd_peer_queue_bulk_rdma_tx_locked( + kptl_peer_t *peer, + kptl_tx_t *tx) +{ + PJK_UT_MSG("Peer=%p TX=%p\n",peer,tx); + + LASSERT(peer->peer_state != PEER_STATE_CANCELED); + LASSERT(tx->tx_state == TX_STATE_ALLOCATED); + tx->tx_state = TX_STATE_WAITING_RESPONSE; + + LASSERT(tx->tx_type == TX_TYPE_LARGE_PUT_RESPONSE || + tx->tx_type == TX_TYPE_LARGE_GET_RESPONSE); + + LASSERT(tx->tx_peer == NULL); + kptllnd_peer_addref(peer,"tx"); + tx->tx_peer = peer; + tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ); + + list_add_tail(&tx->tx_list, &peer->peer_active_txs); + peer->peer_active_txs_change_counter++; +} + +void +kptllnd_peer_dequeue_tx_locked( + kptl_peer_t *peer, + kptl_tx_t *tx) +{ + list_del_init(&tx->tx_list); + /* + * The tx could be on the active list + * or possibily the passive list. 
Either way + * we'll be safe an update the active txs list counter + * (this counter only indicates change, and in this + * case it's possible change, which is an acceptable + * usage) + */ + peer->peer_active_txs_change_counter++; +} + +void +kptllnd_peer_dequeue_tx( + kptl_peer_t *peer, + kptl_tx_t *tx) +{ + spin_lock(&peer->peer_lock); + kptllnd_peer_dequeue_tx_locked(peer,tx); + spin_unlock(&peer->peer_lock); +} + +void +kptllnd_peer_check_sends ( + kptl_peer_t *peer ) +{ + + kptl_tx_t *tx; + kptl_data_t *kptllnd_data = peer->peer_kptllnd_data; + int rc,rc2; + ptl_md_t md; + ptl_handle_me_t meh; + ptl_process_id_t target; + + /* + * If there is nothing to send, and we have hit the credit + * high water mark, then send a no-op message + */ + spin_lock(&peer->peer_lock); + + PJK_UT_MSG_DATA(">>>Peer=%p Credits=%d Outstanding=%d\n", + peer,peer->peer_credits,peer->peer_outstanding_credits); + + if(list_empty(&peer->peer_pending_txs) && + peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER) { + + /* + * Get an idle tx descriptor + * may NOT block: (That's the "0" param) + */ + tx = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE); + if( tx == NULL ) { + CERROR ("Can't return credits to "LPX64": tx descs exhausted\n", + peer->peer_nid); + }else{ + kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_NOOP,0); + kptllnd_peer_queue_tx_locked(peer,tx); + } + } + /* + * Now go through all the sends to see what we can send + */ + while(!list_empty(&peer->peer_pending_txs)) { + tx = list_entry (peer->peer_pending_txs.next, kptl_tx_t, tx_list); + + LASSERT (tx->tx_state == TX_STATE_WAITING_CREDITS); + LASSERT (peer->peer_outstanding_credits >= 0); + LASSERT (peer->peer_outstanding_credits <= *kptllnd_tunables.kptl_peercredits); + LASSERT (peer->peer_credits >= 0); + LASSERT (peer->peer_credits <= *kptllnd_tunables.kptl_peercredits); + + /* + * If there are no credits we're done + */ + if (peer->peer_credits == 0) { + STAT_UPDATE(kps_no_credits); + CDEBUG(D_NET, LPX64": no credits\n",peer->peer_nid); + break; + } + + + /* + * If there is one credit but we have no credits to give + * back then we don't use our one credit and we are done + */ + if (peer->peer_credits == 1 && + peer->peer_outstanding_credits == 0) { + STAT_UPDATE(kps_saving_last_credit); + CDEBUG(D_NET, LPX64": not using last credit\n", + peer->peer_nid); + break; + } + + /* + * Remove the tx from the list. We don't decrement the + * ref count here. The reference is simply transferred from + * the Peer to this calling function, and it will be this + * functions responsibility to dispose of the reference properly + */ + list_del_init(&tx->tx_list); + + /* + * If there is a NOOP in the queue but there + * are pending tx buffers also in the queue + * + * OR we are not at the high-water mark anymore + * + * THEN it is safe to simply discard this NOOP + * and continue one. 
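+ * (Any credits this NOOP would have returned are simply carried in
+ * the header of the next real message, since kptllnd_msg_pack()
+ * always includes peer_outstanding_credits.)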
+ */ + if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP && + (!list_empty(&peer->peer_pending_txs) || + peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)) { + /* redundant NOOP */ + kptllnd_tx_decref(tx); + CDEBUG(D_NET, LPX64": redundant noop\n", + peer->peer_nid); + continue; + } + + PJK_UT_MSG_DATA("--- TXTXTXTXTXTXTXTXTXTXTXTXTXTX\n"); + PJK_UT_MSG_DATA("Sending TX=%p Size=%d\n",tx,tx->tx_msg->ptlm_nob); + PJK_UT_MSG_DATA("Target nid="LPX64"\n",peer->peer_nid); + + + /* + * Assign matchbits for a put/get + */ + if(tx->tx_msg->ptlm_type == PLTLND_MSG_TYPE_PUT || + tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET){ + + PJK_UT_MSG_DATA("next matchbits="LPX64" (before)\n", + peer->peer_next_matchbits); + + + /* Allocate a new match bits value. It might not be needed, + * but we've got a lock right now and we're unlikely to + * wrap... + * + * A set of match bits at the low end are reserved. So we can + * not use them. Just skip over them. This check protects us + * even in the case of 64-bit rollover. + */ + if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS){ + CDEBUG(D_INFO,"Match Bits Rollover for "LPX64"\n", + peer->peer_nid); + peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS; + + } + + /* + * Set the payload match bits and update the peer's counter + */ + tx->tx_msg->ptlm_u.req.kptlrm_matchbits = + peer->peer_next_matchbits ++; + + PJK_UT_MSG_DATA("next matchbits="LPX64" (after)\n", + peer->peer_next_matchbits); + } + + /* + * Complete the message fill in all the rest + * of the header + */ + kptllnd_msg_pack( + tx->tx_msg, + peer->peer_outstanding_credits, + peer->peer_nid, + peer->peer_incarnation, + peer->peer_tx_seqnum, + kptllnd_data); + + /* + * We just sent a packet + */ + peer->peer_tx_seqnum++; + + /* + * And we've returned all of our credits + */ + peer->peer_outstanding_credits = 0; + + /* + * And we have one less credit :-( + */ + peer->peer_credits--; + + /* + * Set the state before the PtlPut() because + * we could get the PUT_END callback before PtlPut() + * returns. + */ + LASSERT(tx->tx_state == TX_STATE_WAITING_CREDITS); + tx->tx_state = TX_STATE_WAITING_RESPONSE; + + /* + * Construct an address that Portals needs from the NID + */ + + target.nid = lnet2ptlnid(kptllnd_data,peer->peer_nid); + target.pid = 0; + + PJK_UT_MSG_DATA("Msg NOB = %d\n",tx->tx_msg->ptlm_nob); + PJK_UT_MSG_DATA("Returned Credits=%d\n",tx->tx_msg->ptlm_credits); + PJK_UT_MSG_DATA("Seq # = "LPX64"\n",tx->tx_msg->ptlm_seq); + + PJK_UT_MSG("lnet TX nid=" LPX64 "\n",peer->peer_nid); + PJK_UT_MSG("ptl TX nid=" LPX64 "\n",target.nid); + + if(tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_GET || + tx->tx_msg->ptlm_type == PLTLND_MSG_TYPE_PUT){ + tempiov_t tempiov; + +#ifdef TESTING_WITH_LOOPBACK + /* + * When doing loopback testing the data comes back + * on the given loopback nid + */ + ptl_process_id_t target; + target.nid = PTL_NID_ANY; + target.pid = 0; +#endif + + PJK_UT_MSG_DATA("matchibts=" LPX64 "\n", + tx->tx_msg->ptlm_u.req.kptlrm_matchbits); + + + /* + * Attach the ME + */ + rc = PtlMEAttach( + kptllnd_data->kptl_nih, + *kptllnd_tunables.kptl_portal, + target, + tx->tx_msg->ptlm_u.req.kptlrm_matchbits, + 0, /* all matchbits are valid - ignore none*/ + PTL_UNLINK, + PTL_INS_BEFORE, + &meh); + if(rc != 0) { + CERROR("PtlMeAttach failed %d\n",rc); + goto failed; + } + + /* Setup the MD */ + kptllnd_setup_md(kptllnd_data,&md, + tx->tx_msg->ptlm_type == LNET_MSG_GET ? 
PTL_MD_OP_PUT : + PTL_MD_OP_GET, + tx, + tx->tx_payload_niov, + tx->tx_payload_iov, + tx->tx_payload_kiov, + tx->tx_payload_offset, + tx->tx_payload_nob, + &tempiov); + + /* + * Add a ref for this MD, because unlink + * events can happen at any time once + * something is posted. + */ + kptllnd_tx_addref(tx); + + /* + * Attach the MD + */ + rc = PtlMDAttach( + meh, + md, + PTL_UNLINK, + &tx->tx_mdh); + if(rc != 0){ + CERROR("PtlMDAttach failed %d\n",rc); + + /* + * Just drop the ref for this MD because it was never + * posted to portals + */ + tx->tx_mdh = PTL_INVALID_HANDLE; + kptllnd_tx_decref(tx); + + rc2 = PtlMEUnlink(meh); + LASSERT(rc2 == 0); + goto failed; + } + } + + + /* + * Setup the MD + */ + md.start = tx->tx_msg; + md.length = tx->tx_msg->ptlm_nob; + md.threshold = 1; + md.options = PTL_MD_OP_PUT; + md.options |= PTL_MD_EVENT_START_DISABLE; + /* we don't need an ACK, we'll get a callback when the get is complete */ + md.options |= PTL_MD_ACK_DISABLE; + md.user_ptr = tx; + md.eq_handle = kptllnd_data->kptl_eqh; + + + /* + * Bind the MD + */ + rc = PtlMDBind ( + kptllnd_data->kptl_nih, + md, + PTL_UNLINK, + &tx->tx_mdh_msg); + if(rc != 0){ + CERROR("PtlMDBind failed %d\n",rc); + tx->tx_mdh_msg = PTL_INVALID_HANDLE; + goto failed; + } + + list_add_tail(&tx->tx_list, &peer->peer_active_txs); + peer->peer_active_txs_change_counter++; + LASSERT(tx->tx_peer == peer); + + /* + * Grab a ref so the TX doesn't go away + * if we fail. + */ + kptllnd_tx_addref(tx); + + spin_unlock(&peer->peer_lock); + + rc = PtlPut ( + tx->tx_mdh_msg, + PTL_NOACK_REQ, /* we dont need an ack */ + target, /* peer "address" */ + *kptllnd_tunables.kptl_portal, /* portal */ + 0, /* cookie */ + LNET_MSG_MATCHBITS, /* match bits */ + 0, /* offset */ + 0); /* header data */ + if(rc != 0){ + CERROR("PtlPut error %d\n",rc); + + /* + * Do the unlink which should succeed + */ + LASSERT(atomic_read(&tx->tx_refcount)>1); + rc2 = PtlMDUnlink(tx->tx_mdh_msg); + LASSERT( rc2 == 0); +#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS + tx->tx_mdh_msg = PTL_INVALID_HANDLE; + kptllnd_tx_decref(tx); +#endif + goto failed; + } + + /* + * Release our temporary reference + */ + kptllnd_tx_decref(tx); + + spin_lock(&peer->peer_lock); + + } + + + spin_unlock(&peer->peer_lock); + + PJK_UT_MSG_DATA("<<<\n"); + return; + +failed: + + /* + * Now unlink the MDs (if they were posted) + */ + if(!PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)){ + LASSERT(atomic_read(&tx->tx_refcount)>1); + rc2 = PtlMDUnlink(tx->tx_mdh); + /* + * The unlink should succeed + */ + LASSERT( rc2 == 0); +#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS + tx->tx_mdh = PTL_INVALID_HANDLE; + kptllnd_tx_decref(tx); +#endif + } + + /* + * Get back the credits + * ??? 
WHY even do this because we're killing the peer + */ + peer->peer_outstanding_credits += tx->tx_msg->ptlm_credits; + peer->peer_credits++; + + spin_unlock(&peer->peer_lock); + + /* + * And cleanup this peer + */ + kptllnd_peer_cancel(peer); + + /* + * And release the tx reference + */ + kptllnd_tx_decref(tx); + + PJK_UT_MSG("<<< FAILED\n"); +} + +int +kptllnd_peer_timedout(kptl_peer_t *peer) +{ + kptl_tx_t *tx; + + spin_lock(&peer->peer_lock); + + /* + * Check the head of the pending list for expiration + * this is a queue, so if the head isn't expired then nothing + * else will be expired + */ + if(!list_empty(&peer->peer_pending_txs)){ + tx = list_entry(peer->peer_pending_txs.next,kptl_tx_t,tx_list); + if(time_after_eq(jiffies,tx->tx_deadline)){ + spin_unlock(&peer->peer_lock); + PJK_UT_MSG("Peer=%p PENDING tx=%p time=%lu sec\n", + peer,tx,(jiffies - tx->tx_deadline)/HZ); + return 1; + } + } + + /* + * Check the head of the active list + */ + if(!list_empty(&peer->peer_active_txs)){ + tx = list_entry(peer->peer_active_txs.next,kptl_tx_t,tx_list); + if(time_after_eq(jiffies,tx->tx_deadline)){ + spin_unlock(&peer->peer_lock); + PJK_UT_MSG("Peer=%p ACTIVE tx=%p time=%lu sec\n", + peer,tx,(jiffies - tx->tx_deadline)/HZ); + return 1; + } + } + + spin_unlock(&peer->peer_lock); + return 0; +} + + +void +kptllnd_peer_check_bucket (int idx, kptl_data_t *kptllnd_data) +{ + struct list_head *peers = &kptllnd_data->kptl_peers[idx]; + struct list_head *ptmp; + kptl_peer_t *peer; + unsigned long flags; + + + /*PJK_UT_MSG("Bucket=%d\n",idx);*/ + + again: + /* NB. We expect to have a look at all the peers and not find any + * rdmas to time out, so we just use a shared lock while we + * take a look... */ + read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags); + + list_for_each (ptmp, peers) { + peer = list_entry (ptmp, kptl_peer_t, peer_list); + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + kptllnd_peer_check_sends(peer); + + if (!kptllnd_peer_timedout(peer)) + continue; + + kptllnd_peer_addref(peer,"temp"); /* 1 ref for me... 
*/
+
+ read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock,
+ flags);
+
+ CERROR("Timed out RDMA with "LPX64"\n",peer->peer_nid);
+
+ kptllnd_peer_cancel(peer);
+ kptllnd_peer_decref(peer,"temp"); /* ...until here */
+
+ /* start again now I've dropped the lock */
+ goto again;
+ }
+ }
+
+ read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+}
+
+kptl_peer_t *
+kptllnd_peer_find (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid)
+{
+ kptl_peer_t *peer;
+ unsigned long flags;
+ read_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+ peer = kptllnd_peer_find_locked(kptllnd_data,nid);
+ read_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags);
+ return peer;
+}
+
+kptl_peer_t *
+kptllnd_peer_find_locked (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid)
+{
+ struct list_head *peer_list = kptllnd_nid2peerlist (kptllnd_data,nid);
+ struct list_head *tmp;
+ kptl_peer_t *peer;
+
+ PJK_UT_MSG(">>> nid="LPX64"\n",nid);
+
+ list_for_each (tmp, peer_list) {
+
+ peer = list_entry (tmp, kptl_peer_t, peer_list);
+
+ LASSERT(peer->peer_state != PEER_STATE_CANCELED);
+
+ if (peer->peer_nid != nid)
+ continue;
+
+ CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
+ peer, nid, atomic_read (&peer->peer_refcount));
+
+ kptllnd_peer_addref(peer,"find");
+ PJK_UT_MSG("<<< Peer=%p\n",peer);
+ return peer;
+ }
+
+ PJK_UT_MSG("<<< NOTFOUND\n");
+ return NULL;
+}
+
+kptl_peer_t *
+kptllnd_peer_handle_hello (
+ kptl_data_t *kptllnd_data,
+ lnet_nid_t nid,
+ kptl_msg_t *msg)
+{
+ kptl_peer_t *peer;
+ kptl_peer_t *peer_to_cancel = 0;
+ unsigned long flags;
+ kptl_tx_t *tx_hello = 0;
+ int rc;
+ __u64 safe_matchbits_from_peer;
+ __u64 safe_matchbits_to_peer = 0;
+
+
+ PJK_UT_MSG(">>>\n");
+
+ safe_matchbits_from_peer = msg->ptlm_u.hello.kptlhm_matchbits +
+ *kptllnd_tunables.kptl_peercredits;
+
+ /*
+ * Immediate message sizes MUST be equal
+ */
+ if( msg->ptlm_u.hello.kptlhm_max_immd_size !=
+ *kptllnd_tunables.kptl_max_immd_size){
+ CERROR("IMMD message size MUST be equal for all peers got %d expected %d\n",
+ msg->ptlm_u.hello.kptlhm_max_immd_size,
+ *kptllnd_tunables.kptl_max_immd_size);
+
+ return 0;
+ }
+
+ write_lock_irqsave(&kptllnd_data->kptl_peer_rw_lock, flags);
+
+ /*
+ * Look for the peer because it could have been previously here
+ */
+ peer = kptllnd_peer_find_locked(kptllnd_data,nid);
+
+ /*
+ * If the peer is already here
+ */
+ if(peer != NULL){
+
+ if(peer->peer_incarnation == 0) {
+ /*
+ * Update the peer state
+ */
+ LASSERT(peer->peer_state == PEER_STATE_WAITING_HELLO);
+ peer->peer_state = PEER_STATE_ACTIVE;
+
+ /*
+ * Update the incarnation
+ */
+ peer->peer_incarnation = msg->ptlm_srcstamp;
+
+ /*
+ * Save the match bits
+ */
+ PJK_UT_MSG_DATA(" **** Updating Matchbits="LPX64" ****\n",
+ safe_matchbits_from_peer);
+
+ peer->peer_next_matchbits = safe_matchbits_from_peer;
+ if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS)
+ peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+ }
+
+ /*
+ * If the incarnation has changed then we need to
+ * resend the hello.
+ */
+ else if( peer->peer_incarnation != msg->ptlm_srcstamp ) {
+
+ /*
+ * Put the match bits into the hello message
+ */
+ safe_matchbits_to_peer =
+ peer->peer_last_matchbits_seen + 1 +
+ *kptllnd_tunables.kptl_peercredits;
+
+ /*
+ * Save this peer to cancel
+ */
+ peer_to_cancel = peer;
+ peer = NULL;
+
+ }else{
+ CERROR("Received HELLO message on already connected peer "LPX64"\n",nid);
+ }
+ }
+
+ if( peer == NULL) {
+
+ /*
+ * Setup a connect HELLO message. We ultimately might not
+ * use it but likely we will.
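+ * (It only goes unused if creating the peer below fails, in
+ * which case it is released on the 'failed' path.)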
+ */ + tx_hello = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE); + if( tx_hello == NULL) { + CERROR("Unable to allocate connect message for "LPX64"\n",nid); + goto failed; + } + + kptllnd_init_msg( + tx_hello->tx_msg, + PTLLND_MSG_TYPE_HELLO, + sizeof(kptl_hello_msg_t)); + /* + * Put the match bits into the hello message + */ + tx_hello->tx_msg->ptlm_u.hello.kptlhm_matchbits = + safe_matchbits_to_peer; + tx_hello->tx_msg->ptlm_u.hello.kptlhm_max_immd_size = + *kptllnd_tunables.kptl_max_immd_size; + + rc = kptllnd_peer_create_locked ( kptllnd_data, &peer, nid); + if(rc != 0){ + CERROR("Failed to create peer (nid="LPX64")\n",nid); + write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock, flags); + peer = NULL; + goto failed; + } + + LASSERT(peer->peer_state == PEER_STATE_WAITING_HELLO); + peer->peer_state = PEER_STATE_ACTIVE; + + /* + * NB We don't need to hold the peer->peer_lock + * because we haven't released the kptl_peer_rw_lock which + * holds prevents anyone else from getting a pointer to + * this newly created peer + */ + + /* + * Update the incarnation + */ + peer->peer_incarnation = msg->ptlm_srcstamp; + + /* + * Save the match bits + */ + PJK_UT_MSG_DATA("**** Setting Matchbits="LPX64" ****\n", + safe_matchbits_from_peer); + peer->peer_next_matchbits = safe_matchbits_from_peer; + if(peer->peer_next_matchbits < PTL_RESERVED_MATCHBITS) + peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS; + + + /* + * And save them from a previous incarnation + */ + peer->peer_last_matchbits_seen = safe_matchbits_to_peer; + + /* + * Queue the message + */ + kptllnd_peer_queue_tx_locked(peer,tx_hello); + + /* + * And don't free it because it's queued + */ + tx_hello = 0; + + } + +failed: + write_unlock_irqrestore(&kptllnd_data->kptl_peer_rw_lock,flags); + + if(tx_hello) + kptllnd_tx_decref(tx_hello); + + /* + * + */ + if(peer){ + kptllnd_peer_check_sends(peer); + } + + if(peer_to_cancel) { + kptllnd_peer_cancel(peer_to_cancel); + kptllnd_peer_decref(peer_to_cancel,"find"); + } + + PJK_UT_MSG("<<< Peer=%p\n",peer); + + return peer; +} + +void +kptllnd_tx_launch ( + kptl_tx_t *tx, + lnet_nid_t target_nid, + lnet_msg_t *ptlmsg ) +{ + kptl_data_t *kptllnd_data = tx->tx_po.po_kptllnd_data; + kptl_peer_t *peer; + unsigned long flags; + rwlock_t *g_lock = &kptllnd_data->kptl_peer_rw_lock; + int rc; + kptl_tx_t *tx_hello; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + PJK_UT_MSG(">>> TX=%p nid="LPX64"\n",tx,target_nid); + + LASSERT (tx->tx_ptlmsg == NULL); + tx->tx_ptlmsg = ptlmsg; /* finalize ptlmsg on completion */ + + LASSERT (tx->tx_peer == NULL); /* only set when assigned a peer */ + + + /* + * First try to find the peer (this will grab the + * read lock + */ + peer = kptllnd_peer_find (kptllnd_data,target_nid); + + /* + * If we find the peer + * then just queue the tx + * (which could send it) + */ + if (peer != NULL) { + kptllnd_peer_queue_tx ( peer, tx ); + kptllnd_peer_decref(peer,"find"); + PJK_UT_MSG("<<< FOUND\n"); + return; + } + + + /* + * Since we didn't find the peer + * Setup a HELLO message. We ultimately might not use it + * (in the case that the peer is racing to connect with us) + * but more than likely we will. 
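+ * (If we do lose that race, the duplicate HELLO tx is simply
+ * dropped once the peer shows up under the write lock below.)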
+ */
+ tx_hello = kptllnd_get_idle_tx(kptllnd_data,0,TX_TYPE_SMALL_MESSAGE);
+ if( tx_hello == NULL) {
+ CERROR("Unable to allocate connect message for "LPX64"\n",target_nid);
+ kptllnd_tx_decref (tx);
+ return;
+ }
+
+ kptllnd_init_msg(
+ tx_hello->tx_msg,
+ PTLLND_MSG_TYPE_HELLO,
+ sizeof(kptl_hello_msg_t));
+
+
+ /*
+ * Now try again with the exclusive lock
+ * so if it's not found we'll add it
+ */
+ write_lock_irqsave(g_lock, flags);
+
+ peer = kptllnd_peer_find_locked (kptllnd_data,target_nid);
+
+ /*
+ * If we find the peer
+ * then just queue the tx
+ * (which could send it)
+ */
+ if (peer != NULL) {
+ write_unlock_irqrestore(g_lock, flags);
+
+ CDEBUG(D_TRACE,"HELLO message race occurred (nid="LPX64")\n",target_nid);
+
+ kptllnd_peer_queue_tx ( peer, tx );
+ kptllnd_peer_decref(peer,"find");
+
+ /* and we don't need the connection tx*/
+ kptllnd_tx_decref(tx_hello);
+
+ PJK_UT_MSG("<<< FOUND2\n");
+ return;
+ }
+
+ PJK_UT_MSG("TX %p creating NEW PEER nid="LPX64"\n",tx,target_nid);
+ rc = kptllnd_peer_create_locked ( kptllnd_data, &peer, target_nid);
+ if(rc != 0){
+ CERROR("Failed to create peer (nid="LPX64")\n",target_nid);
+ write_unlock_irqrestore(g_lock, flags);
+ kptllnd_tx_decref (tx);
+ kptllnd_tx_decref (tx_hello);
+ return;
+ }
+
+
+ /*
+ * We've never seen this peer before. So setup
+ * a default message.
+ */
+ tx_hello->tx_msg->ptlm_u.hello.kptlhm_matchbits = 0;
+ tx_hello->tx_msg->ptlm_u.hello.kptlhm_max_immd_size =
+ *kptllnd_tunables.kptl_max_immd_size;
+
+ /*
+ * Queue the connection request
+ * and the actual tx. We have one credit so
+ * the connection request will go out, and
+ * the tx will wait for a reply.
+ */
+ PJK_UT_MSG("TXHello=%p\n",tx_hello);
+ kptllnd_peer_queue_tx_locked(peer,tx_hello);
+ kptllnd_peer_queue_tx_locked(peer,tx);
+
+ write_unlock_irqrestore(g_lock,flags);
+
+ kptllnd_peer_check_sends(peer);
+ kptllnd_peer_decref(peer,"find");
+
+ PJK_UT_MSG("<<<\n");
+}
diff --git a/lnet/klnds/ptllnd/ptllnd_rx_buf.c b/lnet/klnds/ptllnd/ptllnd_rx_buf.c
new file mode 100644
index 0000000..ea68b86
--- /dev/null
+++ b/lnet/klnds/ptllnd/ptllnd_rx_buf.c
@@ -0,0 +1,940 @@
+#include "ptllnd.h"
+
+kptl_rx_t*
+kptllnd_rx_alloc(
+ kptl_data_t *kptllnd_data );
+
+void
+kptllnd_rx_schedule (kptl_rx_t *rx);
+
+void
+kptllnd_rx_buffer_destroy(
+ kptl_rx_buffer_t *rxb);
+int
+kptllnd_rx_buffer_post(
+ kptl_rx_buffer_t *rxb);
+
+void
+kptllnd_rx_buffer_addref(
+ kptl_rx_buffer_t *rxb,
+ const char *owner);
+
+void
+kptllnd_rx_buffer_pool_init(
+ kptl_rx_buffer_pool_t *rxbp)
+{
+ PJK_UT_MSG("kptllnd_rx_buffer_pool_init\n");
+ memset(rxbp,0,sizeof(*rxbp));
+
+ spin_lock_init (&rxbp->rxbp_lock);
+ INIT_LIST_HEAD (&rxbp->rxbp_list);
+
+}
+
+void
+kptllnd_rx_buffer_pool_fini(
+ kptl_rx_buffer_pool_t *rxbp)
+{
+ kptl_rx_buffer_t *rxb;
+ int rc;
+ int i;
+
+ PJK_UT_MSG("kptllnd_rx_buffer_pool_fini\n");
+
+ spin_lock(&rxbp->rxbp_lock);
+
+ /*
+ * Set the shutdown flag under the lock
+ */
+ rxbp->rxbp_shutdown = 1;
+
+ i = 2;
+ while(!list_empty(&rxbp->rxbp_list))
+ {
+ struct list_head* iter;
+ int count = 0;
+
+ /*
+ * Count how many items are on the list right now
+ */
+ list_for_each(iter,&rxbp->rxbp_list)
+ ++count;
+
+ CDEBUG(D_TRACE,"|rxbp_list|=%d\n",count);
+
+ /*
+ * Loop while we still have items on the list
+ * or we've gone through the list once
+ */
+ while(!list_empty(&rxbp->rxbp_list) && count!=0)
+ {
+ --count;
+ rxb = list_entry (rxbp->rxbp_list.next,
+ kptl_rx_buffer_t, rxb_list);
+
+
LASSERT(rxb->rxb_state == RXB_STATE_POSTED); + + + list_del_init(&rxb->rxb_list); + + /* + * We have hit the one race where the MD has been put + * on the list, but the MD is not created. + */ + if(PtlHandleIsEqual(rxb->rxb_mdh,PTL_INVALID_HANDLE)){ + list_add_tail(&rxb->rxb_list,&rxbp->rxbp_list); + continue; + } + + + /* + * Keep the RXB from being deleted + */ + kptllnd_rx_buffer_addref(rxb,"temp"); + + spin_unlock(&rxbp->rxbp_lock); + + /* + * Unlinked the MD + */ + LASSERT(atomic_read(&rxb->rxb_refcount)>1); + rc = PtlMDUnlink(rxb->rxb_mdh); + if(rc == 0){ +#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS + rxb->rxb_mdh = PTL_INVALID_HANDLE; + kptllnd_rx_buffer_decref(rxb,"portals"); +#endif + /* + * Drop the reference we took above + */ + kptllnd_rx_buffer_decref(rxb,"temp"); + + spin_lock(&rxbp->rxbp_lock); + }else{ + PJK_UT_MSG("PtlMDUnlink(%p) rc=%d\n",rxb,rc); + /* + * The unlinked failed so put this back + * on the list for later + */ + spin_lock(&rxbp->rxbp_lock); + + list_add_tail(&rxb->rxb_list,&rxbp->rxbp_list); + + /* + * Drop the reference we took above + */ + kptllnd_rx_buffer_decref(rxb,"temp"); + } + } + + /* + * If there are still items on the list we + * need to take a break, and let the Busy RX's + * finish up. + */ + if(!list_empty(&rxbp->rxbp_list)){ + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "Waiting for %d Busy RX Buffers\n", + rxbp->rxbp_count); + spin_unlock(&rxbp->rxbp_lock); + cfs_pause(cfs_time_seconds(1)); + spin_lock(&rxbp->rxbp_lock); + } + } + + CDEBUG(D_TRACE,"|rxbp_list|=EMPTY\n"); + + if(rxbp->rxbp_count != 0){ + PJK_UT_MSG("Waiting for %d RX Buffers to unlink\n",rxbp->rxbp_count); + + i = 2; + while (rxbp->rxbp_count != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "Waiting for %d RX Buffers to unlink\n", + rxbp->rxbp_count); + spin_unlock(&rxbp->rxbp_lock); + cfs_pause(cfs_time_seconds(1)); + spin_lock(&rxbp->rxbp_lock); + } + } + + CDEBUG(D_TRACE,"|rxbp_count|=0\n"); + + spin_unlock(&rxbp->rxbp_lock); +} + + +int +kptllnd_rx_buffer_pool_reserve( + kptl_rx_buffer_pool_t *rxbp, + kptl_data_t *kptllnd_data, + int count) +{ + int add = 0; + int i; + int rc; + kptl_rx_buffer_t *rxb; + int nbuffers; + + spin_lock(&rxbp->rxbp_lock); + + PJK_UT_MSG("kptllnd_rx_buffer_pool_reserve(%d)\n",count); + + /* + * Prevent reservation of anymore while we are shutting down + */ + if(rxbp->rxbp_shutdown){ + spin_unlock(&rxbp->rxbp_lock); + return -ESHUTDOWN; + } + + /* + * Make the reservation + */ + rxbp->rxbp_reserved += count; + + /* + * Calcuate the number or buffers we need + * +1 to handle any rounding error + */ + nbuffers = (rxbp->rxbp_reserved) * + (*kptllnd_tunables.kptl_max_immd_size) / + (PAGE_SIZE * (*kptllnd_tunables.kptl_rxb_npages)); + ++nbuffers ; + + PJK_UT_MSG("nbuffers=%d rxbp_count=%d\n",nbuffers,rxbp->rxbp_count); + + if(rxbp->rxbp_count < nbuffers) + add = nbuffers - rxbp->rxbp_count; + + PJK_UT_MSG("adding=%d\n",add); + + /* + * Under the same lock assume they are added + * we'll subtract if we hit an error. 
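+ * (Presumably the optimistic count keeps a concurrent reserve from
+ * allocating the same buffers again while this thread is outside
+ * the lock.)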
+ */ + rxbp->rxbp_count += add; + spin_unlock(&rxbp->rxbp_lock); + + for(i=0;irxb_po, + kptllnd_data, + POSTED_OBJECT_TYPE_RXB); + + rxb->rxb_pool = rxbp; + rxb->rxb_state = RXB_STATE_IDLE; + rxb->rxb_mdh = PTL_INVALID_HANDLE; + INIT_LIST_HEAD (&rxb->rxb_list); + INIT_LIST_HEAD (&rxb->rxb_repost_list); + + PORTAL_ALLOC( rxb->rxb_buffer, + PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages); + if(rxb->rxb_buffer == NULL) { + CERROR("Failed to allocate data buffer or size %d pages for rx%d\n", + *kptllnd_tunables.kptl_rxb_npages,i); + rc = -ENOMEM; + goto failed; + } + + rc = kptllnd_rx_buffer_post(rxb); + if(rc != 0) + goto failed; + } + return 0; + +failed: + spin_lock(&rxbp->rxbp_lock); + + /* + * We really didn't add as many + * as we were planning to. + */ + rxbp->rxbp_count -= add - i; + + /* + * Cancel this reservation + */ + rxbp->rxbp_reserved -= count; + spin_unlock(&rxbp->rxbp_lock); + + + if(rxb){ + if(rxb->rxb_buffer) + PORTAL_FREE( rxb->rxb_buffer,PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages); + PORTAL_FREE( rxb,sizeof(*rxb)); + } + + return rc; +} + +void +kptllnd_rx_buffer_pool_unreserve( + kptl_rx_buffer_pool_t *rxbp, + int count) +{ + spin_lock(&rxbp->rxbp_lock); + PJK_UT_MSG("kptllnd_rx_buffer_pool_unreserve(%d)\n",count); + rxbp->rxbp_reserved -= count; + spin_unlock(&rxbp->rxbp_lock); +} + +void +kptllnd_rx_buffer_scheduled_post( + kptl_rx_buffer_t *rxb) +{ + kptl_data_t *kptllnd_data = rxb->rxb_po.po_kptllnd_data; + unsigned long flags; + + PJK_UT_MSG("rxb=%p\n",rxb); + + spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags); + LASSERT(list_empty(&rxb->rxb_repost_list)); + list_add_tail(&rxb->rxb_repost_list,&kptllnd_data->kptl_sched_rxbq); + wake_up(&kptllnd_data->kptl_sched_waitq); + spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags); +} + + +int +kptllnd_rx_buffer_post( + kptl_rx_buffer_t *rxb) +{ + int rc; + ptl_md_t md; + ptl_handle_me_t meh; + ptl_handle_md_t mdh; + ptl_process_id_t any; + kptl_data_t *kptllnd_data = rxb->rxb_po.po_kptllnd_data; + kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool; + + any.nid = PTL_NID_ANY; + any.pid = PTL_PID_ANY; + + /*PJK_UT_MSG("rxb=%p\n",rxb);*/ + + spin_lock(&rxbp->rxbp_lock); + + /* + * No new RXB's can enter the POSTED state + */ + if(rxbp->rxbp_shutdown){ + spin_unlock(&rxbp->rxbp_lock); + return -ESHUTDOWN; + } + + LASSERT(!in_interrupt()); + + LASSERT(rxb->rxb_state == RXB_STATE_IDLE); + LASSERT(atomic_read(&rxb->rxb_refcount)==0); + LASSERT(PtlHandleIsEqual(rxb->rxb_mdh,PTL_INVALID_HANDLE)); + + list_add_tail(&rxb->rxb_list,&rxbp->rxbp_list); + atomic_set(&rxb->rxb_refcount,1); + rxb->rxb_state = RXB_STATE_POSTED; + + spin_unlock(&rxbp->rxbp_lock); + + /* + * Attach the ME + */ + rc = PtlMEAttach( + kptllnd_data->kptl_nih, + *kptllnd_tunables.kptl_portal, + any, + LNET_MSG_MATCHBITS, + 0, /* all matchbits are valid - ignore none*/ + PTL_UNLINK, + PTL_INS_AFTER, + &meh); + if(rc != 0) { + CERROR("PtlMeAttach rxb failed %d\n",rc); + goto failure; + } + + /* + * Setup MD + */ + md.start = rxb->rxb_buffer; + md.length = PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages; + md.threshold = PTL_MD_THRESH_INF; + md.options = PTL_MD_OP_PUT; + md.options |= PTL_MD_EVENT_START_DISABLE; + md.options |= PTL_MD_MAX_SIZE; + md.user_ptr = rxb; + md.max_size = *kptllnd_tunables.kptl_max_immd_size; + md.eq_handle = kptllnd_data->kptl_eqh; + + + /* + * Attach the MD + */ + rc = PtlMDAttach( + meh, + md, + PTL_UNLINK, + &mdh); + if(rc != 0){ + int rc2; + CERROR("PtlMDAttach rxb failed %d\n",rc); + rc2 = PtlMEUnlink(meh); + LASSERT(rc2 == 
0); + goto failure; + } + + /* + * Assign the MDH under the lock + * to deal with shutdown race, of + * a partially constructed rbx + */ + spin_lock(&rxbp->rxbp_lock); + rxb->rxb_mdh = mdh; + spin_unlock(&rxbp->rxbp_lock); + + return 0; + + +failure: + /* + * Cleanup on error + */ + spin_lock(&rxbp->rxbp_lock); + list_del_init(&rxb->rxb_list); + atomic_set(&rxb->rxb_refcount,0); + rxb->rxb_state = RXB_STATE_IDLE; + spin_unlock(&rxbp->rxbp_lock); + + return rc; +} + +void +kptllnd_rx_buffer_post_handle_error( + kptl_rx_buffer_t *rxb) +{ + int rc; + rc = kptllnd_rx_buffer_post(rxb); + if(rc!=0){ + /* Don't log on shutdown */ + if(rc != -ESHUTDOWN) + CERROR("Failing to Repost buffer rc=%d\n",rc); + + kptllnd_rx_buffer_destroy(rxb); + /* Should I destroy the peer? + * I don't think so. But this now + * now means there is some chance + * under very heavy load that we will drop a packet. + * On the other hand, if there is more buffers in + * the pool that are reserved this won't happen. + * And secondly under heavly load it is liklye a + * a new peer will be added added, the reservation + * for the ones that were lost will + * get new backing buffers at that time. + * + * So things are starting to get bad, but + * in all likelihood things will be fine, + * and even better they might correct themselves + * in time. + */ + } +} + +void +kptllnd_rx_buffer_destroy( + kptl_rx_buffer_t *rxb) +{ + kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool; + + LASSERT(atomic_read(&rxb->rxb_refcount) == 0); + LASSERT(rxb->rxb_state == RXB_STATE_IDLE); + LASSERT(PtlHandleIsEqual(rxb->rxb_mdh,PTL_INVALID_HANDLE)); + + spin_lock(&rxbp->rxbp_lock); + list_del(&rxb->rxb_list); + rxbp->rxbp_count--; + spin_unlock(&rxbp->rxbp_lock); + + PORTAL_FREE( rxb->rxb_buffer,PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages); + PORTAL_FREE(rxb,sizeof(*rxb)); +} + + + +void +kptllnd_rx_buffer_callback(ptl_event_t *ev) +{ + kptl_rx_buffer_t *rxb = ev->md.user_ptr; + kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool; + /*kptl_data_t *kptllnd_data = rxb->rxb_po.po_kptllnd_data;*/ + kptl_rx_t *rx; + int nob; + int unlinked; + + /* + * Set the local unlinked flag + */ + unlinked = ev->type == PTL_EVENT_UNLINK; +#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS + if( ev->unlinked ) + unlinked = 1; +#endif + + if(!rxbp->rxbp_shutdown){ + PJK_UT_MSG("RXB Callback %s(%d) rxb=%p nid="LPX64" unlink=%d\n", + get_ev_type_string(ev->type),ev->type, + rxb,ev->initiator.nid,unlinked); + } + + LASSERT( ev->md.start == rxb->rxb_buffer); + LASSERT( ev->offset + ev->mlength <= PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages); + LASSERT( ev->type == PTL_EVENT_PUT_END || ev->type == PTL_EVENT_UNLINK); + LASSERT( ev->match_bits == LNET_MSG_MATCHBITS); + + CDEBUG((ev->ni_fail_type == PTL_OK) ? 
D_NET : D_ERROR, + "event type %d, status %d from "LPX64"\n", + ev->type, ev->ni_fail_type,ev->initiator.nid); + + nob = ev->mlength; + + if(unlinked){ + spin_lock(&rxbp->rxbp_lock); + + /* + * Remove this from the list + */ + list_del_init(&rxb->rxb_list); + + LASSERT(rxb->rxb_state == RXB_STATE_POSTED); + rxb->rxb_state = RXB_STATE_IDLE; + rxb->rxb_mdh = PTL_INVALID_HANDLE; + + if( rxbp->rxbp_shutdown){ + spin_unlock(&rxbp->rxbp_lock); + kptllnd_rx_buffer_decref(rxb,"portals"); + return; + } + + spin_unlock(&rxbp->rxbp_lock); + + + } + + /* + * Handle failure by just dropping the path + */ + if(ev->ni_fail_type != PTL_NI_OK){ + CERROR("Message Dropped: ev status %d",ev->ni_fail_type); + if(unlinked) + kptllnd_rx_buffer_scheduled_post(rxb); + return; + } + + /* + * Allocate an RX + */ + rx = kptllnd_rx_alloc(rxb->rxb_po.po_kptllnd_data); + if(rx == 0){ + CERROR("Message Dropped: Memory allocation failure"); + if(unlinked) + kptllnd_rx_buffer_scheduled_post(rxb); + return; + } + + PJK_UT_MSG_DATA("New RX=%p\n",rx); + + /* + * If we are unlinked we can just transfer the ref + * that portals owned to the ref that this RX owns + * otherwise we need to add a ref specifically for this RX + */ + if(!unlinked) + kptllnd_rx_buffer_addref(rxb,"rx"); + + rx->rx_msg = rxb->rxb_buffer + ev->offset; + rx->rx_rxb = rxb; + rx->rx_nob = nob; +#ifdef TESTING_WITH_LOOPBACK + /* + * When testing with loopback on socknal + * packets are received on loopback NAL so + * until I figure out how to do that properly + * just make it look like it came from this NID + */ + rx->rx_initiator = rxb->rxb_po.po_kptllnd_data->kptl_portals_id; +#else + rx->rx_initiator = ev->initiator; +#endif + + kptllnd_rx_schedule(rx); + + if(!rxbp->rxbp_shutdown){ + PJK_UT_MSG("<<< rx=%p rxb=%p\n",rx,rxb); + } +} + + +void +kptllnd_rx_schedule (kptl_rx_t *rx) +{ + unsigned long flags; + kptl_data_t *kptllnd_data = rx->rx_rxb->rxb_po.po_kptllnd_data; + + CDEBUG(D_NET, "rx\n"); + + PJK_UT_MSG("RX Schedule %p\n",rx); + + spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags); + list_add_tail(&rx->rx_list,&kptllnd_data->kptl_sched_rxq); + wake_up(&kptllnd_data->kptl_sched_waitq); + spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags); +} + + +void +kptllnd_rx_scheduler_handler(kptl_rx_t *rx) +{ + int rc; + kptl_rx_buffer_t *rxb = rx->rx_rxb; + kptl_msg_t *msg = rx->rx_msg; + kptl_data_t *kptllnd_data = rxb->rxb_po.po_kptllnd_data; + kptl_peer_t *peer = NULL; + int returned_credits = 0; + int type = msg->ptlm_type; + lnet_nid_t lnet_initiator_nid = ptl2lnetnid(kptllnd_data,rx->rx_initiator.nid); + + + PJK_UT_MSG_DATA(">>> RXRXRXRXRXRXRXRXRXRXRXRX\n"); + PJK_UT_MSG_DATA("rx=%p nob=%d\n",rx,rx->rx_nob); + + /* + * If the nob==0 then silently discard this message + */ + if(rx->rx_nob == 0) + goto exit; + + rc = kptllnd_msg_unpack(msg, rx->rx_nob, kptllnd_data); + if (rc != 0) { + CERROR ("Error %d unpacking rx from "LPX64"\n", + rc, rx->rx_initiator.nid); + goto exit; + } + + PJK_UT_MSG_DATA("RX=%p Type=%s(%d)\n",rx, + get_msg_type_string(type),type); + PJK_UT_MSG_DATA("Msg NOB = %d\n",msg->ptlm_nob); + PJK_UT_MSG_DATA("Returned Credits=%d\n",msg->ptlm_credits); + PJK_UT_MSG_DATA("Seq # ="LPX64"\n",msg->ptlm_seq); + PJK_UT_MSG_DATA("lnet RX nid=" LPX64 "\n",lnet_initiator_nid); + PJK_UT_MSG("ptl RX nid=" LPX64 "\n",rx->rx_initiator.nid); + + if(type == PTLLND_MSG_TYPE_HELLO) + { + peer = kptllnd_peer_handle_hello( + kptllnd_data, + lnet_initiator_nid, + msg); + if( peer == NULL){ + CERROR ("Failed to create peer for "LPX64"\n", + 
lnet_initiator_nid); + goto exit; + } + + if (!( msg->ptlm_dststamp == kptllnd_data->kptl_incarnation || + msg->ptlm_dststamp == 0)) { + CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n", + peer->peer_nid, + msg->ptlm_dststamp, + kptllnd_data->kptl_incarnation ); + goto exit; + } + } + else + { + peer = kptllnd_peer_find(kptllnd_data,lnet_initiator_nid); + if( peer == NULL){ + CERROR ("No connection with "LPX64"\n", + lnet_initiator_nid); + goto exit; + } + + if (msg->ptlm_dststamp != kptllnd_data->kptl_incarnation) { + CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n", + peer->peer_nid, + msg->ptlm_dststamp, + kptllnd_data->kptl_incarnation ); + goto exit; + } + } + + if( msg->ptlm_srcnid != peer->peer_nid){ + CERROR ("Stale rx srcnid "LPX64" expected "LPX64"\n", + msg->ptlm_srcnid, + peer->peer_nid ); + goto exit; + } + if( msg->ptlm_srcstamp != peer->peer_incarnation){ + CERROR ("Stale rx from "LPX64" srcstamp"LPX64" expected "LPX64"\n", + peer->peer_nid, + msg->ptlm_srcstamp, + peer->peer_incarnation ); + goto exit; + } + if( msg->ptlm_dstnid != kptllnd_data->kptl_ni->ni_nid){ + CERROR ("Stale rx from "LPX64" dststamp "LPX64" expected "LPX64"\n", + peer->peer_nid, + msg->ptlm_dstnid, + kptllnd_data->kptl_ni->ni_nid ); + goto exit; + } + + /* + * Save the number of credits + */ + returned_credits = msg->ptlm_credits; + + /* + * Attach the peer to the RX + * it now is responsibly for releaseing the refrence + */ + rx->rx_peer = peer; + peer = 0; + + /* + * Note: We are explicitly ignore sequence # + * It is informational only + */ + switch (msg->ptlm_type) { + default: + CERROR("Bad PTL message type %x from "LPX64"\n", + msg->ptlm_type, rx->rx_peer->peer_nid); + break; + + case PTLLND_MSG_TYPE_HELLO: + PJK_UT_MSG("PTLLND_MSG_TYPE_HELLO\n"); + break; + + case PTLLND_MSG_TYPE_NOOP: + PJK_UT_MSG("PTLLND_MSG_TYPE_NOOP\n"); + break; + + case PTLLND_MSG_TYPE_IMMEDIATE: + PJK_UT_MSG("PTLLND_MSG_TYPE_IMMEDIATE\n"); + rc = lnet_parse(kptllnd_data->kptl_ni, + &msg->ptlm_u.immediate.kptlim_hdr, + msg->ptlm_srcnid, + rx); + /* RX Completing asynchronously */ + if( rc >= 0) + rx = 0; + break; + + case PLTLND_MSG_TYPE_PUT: + case PTLLND_MSG_TYPE_GET: + PJK_UT_MSG("PTLLND_MSG_TYPE_%s\n", + msg->ptlm_type == PLTLND_MSG_TYPE_PUT ? 
+ "PUT" : "GET"); + + /* + * Save the last match bits used + */ + spin_lock(&rx->rx_peer->peer_lock); + if(msg->ptlm_u.req.kptlrm_matchbits > rx->rx_peer->peer_last_matchbits_seen) + rx->rx_peer->peer_last_matchbits_seen = msg->ptlm_u.req.kptlrm_matchbits; + spin_unlock(&rx->rx_peer->peer_lock); + + rc = lnet_parse(kptllnd_data->kptl_ni, + &msg->ptlm_u.req.kptlrm_hdr, + msg->ptlm_srcnid, + rx); + + /* RX Completing asynchronously */ + if( rc >= 0) + rx = 0; + break; + } + + + CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n", + type, returned_credits, peer->peer_nid); + +exit: + /* PEER == NULL if it is not yet assigned or already + * been attached to RX */ + if(peer) + kptllnd_peer_decref(peer,"lookup"); + + /* RX == NULL if it is completing asynchronously */ + if(rx) + kptllnd_rx_decref(rx,"sched",kptllnd_data); + + PJK_UT_MSG_DATA("<<< RXRXRXRXRXRXRXRXRXRXRXRX rx=%p\n",rx); + return; +} + +void +kptllnd_rx_buffer_addref( + kptl_rx_buffer_t *rxb, + const char *owner) +{ + atomic_inc(&rxb->rxb_refcount); + +#if 0 + /* + * The below message could actually be out of sync + * with the real ref count, and is for informational purposes + * only + */ + PJK_UT_MSG("rxb=%p owner=%s count=%d\n",rxb,owner, + atomic_read(&rxb->rxb_refcount)); +#endif +} + +void +kptllnd_rx_buffer_decref( + kptl_rx_buffer_t *rxb, + const char *owner) +{ + if( !atomic_dec_and_test (&rxb->rxb_refcount)){ + +#if 0 + /* + * The below message could actually be out of sync + * with the real ref count, and is for informational purposes + * only + */ + PJK_UT_MSG("rxb=%p owner=%s count=%d\n",rxb,owner, + atomic_read(&rxb->rxb_refcount)); +#endif + return; + } + +#if 0 + PJK_UT_MSG("rxb=%p owner=%s LAST REF reposting\n",rxb,owner); +#endif + + kptllnd_rx_buffer_post_handle_error(rxb); +} + +kptl_rx_t* +kptllnd_rx_alloc( + kptl_data_t *kptllnd_data ) +{ + kptl_rx_t* rx; + + if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_RX_ALLOC )){ + PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_RX_ALLOC SIMULATION triggered\n"); + CERROR ("FAIL_BLOCKING_RX_ALLOC SIMULATION triggered\n"); + STAT_UPDATE(kps_rx_allocation_failed); + return 0; + } + + rx = cfs_mem_cache_alloc ( kptllnd_data->kptl_rx_cache , GFP_ATOMIC); + if(rx == 0 ){ + CERROR("Failed to allocate rx\n"); + STAT_UPDATE(kps_rx_allocation_failed); + + }else{ + + STAT_UPDATE(kps_rx_allocated); + + memset(rx,0,sizeof(rx)); + + CFS_INIT_LIST_HEAD(&rx->rx_list); + atomic_set(&rx->rx_refcount,1); + } + + return rx; +} + +void +kptllnd_rx_destroy(kptl_rx_t *rx,kptl_data_t *kptllnd_data) +{ + kptl_peer_t *peer = rx->rx_peer; + kptl_msg_t *msg = rx->rx_msg; + int returned_credits = msg->ptlm_credits; + + PJK_UT_MSG(">>> rx=%p\n",rx); + + STAT_UPDATE(kps_rx_released); + + LASSERT(atomic_read(&rx->rx_refcount)==0); + + if(rx->rx_rxb){ + PJK_UT_MSG("Release rxb=%p\n",rx->rx_rxb); + kptllnd_rx_buffer_decref(rx->rx_rxb,"rx"); + rx->rx_rxb = 0; + }else{ + PJK_UT_MSG("rxb already released\n"); + } + + if(peer){ + + /* + * Update credits + * (Only after I've reposted the buffer) + */ + spin_lock(&peer->peer_lock); + peer->peer_credits += returned_credits; + LASSERT( peer->peer_credits <= + *kptllnd_tunables.kptl_peercredits); + peer->peer_outstanding_credits++; + LASSERT( peer->peer_outstanding_credits <= + *kptllnd_tunables.kptl_peercredits); + spin_unlock(&peer->peer_lock); + + PJK_UT_MSG_DATA("Giving Back %d credits rx=%p\n",returned_credits,rx); + + /* Have I received credits that will let me send? 
*/ + if (returned_credits != 0) + kptllnd_peer_check_sends(peer); + + kptllnd_peer_decref(peer,"lookup"); + } + + cfs_mem_cache_free(kptllnd_data->kptl_rx_cache,rx); + + PJK_UT_MSG("<<< rx=%p\n",rx); +} + +void +kptllnd_rx_addref(kptl_rx_t *rx,const char *owner) +{ + atomic_inc(&rx->rx_refcount); + + /* + * The below message could actually be out of sync + * with the real ref count, and is for informational purposes + * only + */ + PJK_UT_MSG("rx=%p owner=%s count=%d\n",rx,owner, + atomic_read(&rx->rx_refcount)); +} + +void +kptllnd_rx_decref(kptl_rx_t *rx,const char *owner,kptl_data_t *kptllnd_data) +{ + if( !atomic_dec_and_test (&rx->rx_refcount)){ + /* + * The below message could actually be out of sync + * with the real ref count, and is for informational purposes + * only + */ + PJK_UT_MSG("rx=%p owner=%s count=%d\n",rx,owner, + atomic_read(&rx->rx_refcount)); + return; + } + + PJK_UT_MSG("rx=%p owner=%s LAST REF destroying\n",rx,owner); + + kptllnd_rx_destroy(rx,kptllnd_data); +} + diff --git a/lnet/klnds/ptllnd/ptllnd_tx.c b/lnet/klnds/ptllnd/ptllnd_tx.c new file mode 100644 index 0000000..e69fdec --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_tx.c @@ -0,0 +1,516 @@ +#include "ptllnd.h" + + +void +kptllnd_tx_schedule (kptl_tx_t *tx); + + + +int +kptllnd_setup_tx_descs (kptl_data_t *kptllnd_data) +{ + kptl_tx_t *tx; + int i; + + PJK_UT_MSG("\n"); + + /* + * First initialize the tx descriptors + */ + memset(kptllnd_data->kptl_tx_descs, 0, + PTLLND_TX_MSGS() * sizeof(kptl_tx_t)); + + for (i = 0; i < PTLLND_TX_MSGS(); i++) { + tx = &kptllnd_data->kptl_tx_descs[i]; + + + kptllnd_posted_object_setup(&tx->tx_po, + kptllnd_data, + POSTED_OBJECT_TYPE_TX); + + CFS_INIT_LIST_HEAD(&tx->tx_list); + CFS_INIT_LIST_HEAD(&tx->tx_schedlist); + + /* + * Determine if this is a regular or reserved descriptor + */ + tx->tx_isnblk = (i >= *kptllnd_tunables.kptl_ntx); + + /* + * Set the state + */ + tx->tx_state = TX_STATE_ON_IDLE_QUEUE; + + PORTAL_ALLOC( tx->tx_msg, *kptllnd_tunables.kptl_max_immd_size ); + if(tx->tx_msg == 0){ + CERROR("Failed to allocate TX payload\n"); + kptllnd_cleanup_tx_descs(kptllnd_data); + } + + + /* + * Add this to the correct queue + */ + if (tx->tx_isnblk) + list_add (&tx->tx_list, + &kptllnd_data->kptl_idle_nblk_txs); + else + list_add (&tx->tx_list, + &kptllnd_data->kptl_idle_txs); + } + + return (0); +} + +void +kptllnd_cleanup_tx_descs(kptl_data_t *kptllnd_data) +{ + kptl_tx_t *tx; + int i; + + PJK_UT_MSG("\n"); + + for (i = 0; i < PTLLND_TX_MSGS(); i++) { + tx = &kptllnd_data->kptl_tx_descs[i]; + + + /* + * Handle partial initization by stopping + * when we hit one that is not fully initialized + */ + if( tx->tx_msg == 0 ) + break; + + LASSERT( tx->tx_state == TX_STATE_ON_IDLE_QUEUE ); + + PORTAL_FREE(tx->tx_msg,*kptllnd_tunables.kptl_max_immd_size); + } +} + +kptl_tx_t * +kptllnd_get_idle_tx( + kptl_data_t *kptllnd_data, + int may_block, + kptl_tx_type_t purpose) +{ + kptl_tx_t *tx = NULL; + ENTRY; + + PJK_UT_MSG(">>> may_block=%d purpose=%d\n",may_block,purpose); + + if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX_PUT_ALLOC ) && purpose == TX_TYPE_LARGE_PUT){ + PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX_PUT_ALLOC SIMULATION triggered\n"); + CERROR ("FAIL_BLOCKING_TX_PUT_ALLOC SIMULATION triggered\n"); + tx = NULL; + STAT_UPDATE(kpt_tx_allocation_failed); + goto exit; + } + if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX_GET_ALLOC ) && purpose == TX_TYPE_LARGE_GET){ + PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX_GET_ALLOC SIMULATION triggered\n"); + CERROR ("FAIL_BLOCKING_TX_GET_ALLOC 
SIMULATION triggered\n"); + tx = NULL; + STAT_UPDATE(kpt_tx_allocation_failed); + goto exit; + } + if(IS_SIMULATION_ENABLED( FAIL_BLOCKING_TX )){ + PJK_UT_MSG_SIMULATION("FAIL_BLOCKING_TX SIMULATION triggered\n"); + CERROR ("FAIL_BLOCKING_TX SIMULATION triggered\n"); + tx = NULL; + STAT_UPDATE(kpt_tx_allocation_failed); + goto exit; + } + + while ( !kptllnd_data->kptl_shutdown ) { + + spin_lock(&kptllnd_data->kptl_tx_lock); + + /* "normal" descriptor is free */ + if (!list_empty (&kptllnd_data->kptl_idle_txs)) { + tx = list_entry (kptllnd_data->kptl_idle_txs.next, + kptl_tx_t, tx_list); + break; + } + + if (!may_block) { + /* may dip into reserve pool */ + if (list_empty (&kptllnd_data->kptl_idle_nblk_txs)) { + CERROR ("reserved tx desc pool exhausted\n"); + break; + } + + tx = list_entry (kptllnd_data->kptl_idle_nblk_txs.next, + kptl_tx_t, tx_list); + break; + } + + /* block for idle tx */ + spin_unlock(&kptllnd_data->kptl_tx_lock); + + wait_event (kptllnd_data->kptl_idle_tx_waitq, + !list_empty (&kptllnd_data->kptl_idle_txs) || + kptllnd_data->kptl_shutdown); + } + + if (tx != NULL) { + + /* + * Check the state + */ + LASSERT(tx->tx_state == TX_STATE_ON_IDLE_QUEUE); + + /* + * Reference is now owned by caller + */ + LASSERT(atomic_read(&tx->tx_refcount)== 0); + atomic_set(&tx->tx_refcount,1); + + /* + * Remove it from the idle queue + */ + list_del_init (&tx->tx_list); + + /* + * Set the state and type + */ + tx->tx_state = TX_STATE_ALLOCATED; + tx->tx_type = purpose; + + /* + * Initialize the TX descriptor so that cleanup can be + * handled easily even with a partially initialized descriptor + */ + tx->tx_mdh = PTL_INVALID_HANDLE; + tx->tx_mdh_msg = PTL_INVALID_HANDLE; + tx->tx_ptlmsg = 0; + tx->tx_ptlmsg_reply = 0; + tx->tx_peer = 0; + tx->tx_associated_rx = 0; + + /* + * These must be re-initialized + */ + tx->tx_status = -EINVAL; + tx->tx_seen_send_end = 0; + tx->tx_seen_reply_end = 0; + tx->tx_payload_niov = 0; + tx->tx_payload_iov = 0; + tx->tx_payload_kiov = 0; + tx->tx_payload_offset = 0; + tx->tx_payload_nob = 0; + + STAT_UPDATE(kps_tx_allocated); + }else{ + STAT_UPDATE(kpt_tx_allocation_failed); + } + + spin_unlock(&kptllnd_data->kptl_tx_lock); + +exit: + PJK_UT_MSG("<<< tx=%p\n",tx); + + RETURN(tx); +} + +void +kptllnd_tx_done (kptl_tx_t *tx) +{ + kptl_data_t *kptllnd_data = tx->tx_po.po_kptllnd_data; + LASSERT (!in_interrupt()); + + PJK_UT_MSG(">>> tx=%p\n",tx); + + LASSERT(tx->tx_state != TX_STATE_ON_IDLE_QUEUE); + LASSERT(PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)); + LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE)); + LASSERT(atomic_read(&tx->tx_refcount) == 0); + LASSERT(list_empty(&tx->tx_schedlist)); /*not any the scheduler list*/ + + if(tx->tx_ptlmsg != 0){ + PJK_UT_MSG("tx=%p finalize\n",tx); + lnet_finalize (kptllnd_data->kptl_ni, tx->tx_ptlmsg, tx->tx_status); + tx->tx_ptlmsg = 0; + } + if(tx->tx_ptlmsg_reply != 0){ + PJK_UT_MSG("tx=%p finalize reply\n",tx); + lnet_finalize (kptllnd_data->kptl_ni, tx->tx_ptlmsg_reply, tx->tx_status); + tx->tx_ptlmsg_reply = 0; + } + + /* + * Release the associated RX if there is one + */ + if(tx->tx_associated_rx){ + PJK_UT_MSG("tx=%p destroy associated rx %p\n",tx,tx->tx_associated_rx); + kptllnd_rx_decref(tx->tx_associated_rx,"tx",kptllnd_data); + tx->tx_associated_rx =0; + } + + /* + * Cleanup resources associate with the peer + */ + if(tx->tx_peer){ + PJK_UT_MSG("tx=%p detach from peer=%p\n",tx,tx->tx_peer); + kptllnd_peer_dequeue_tx(tx->tx_peer,tx); + kptllnd_peer_decref(tx->tx_peer,"tx"); + tx->tx_peer = NULL; 
+ } + + LASSERT(list_empty(&tx->tx_list)); /* removed from any peer list*/ + + /* + * state = back on idle queue + */ + tx->tx_state = TX_STATE_ON_IDLE_QUEUE; + + /* + * Put this tx descriptor back on the correct idle queue + * If this is a "normal" descriptor then somebody might + * be waiting so wake them up + */ + spin_lock(&kptllnd_data->kptl_tx_lock); + + if (tx->tx_isnblk) { + list_add (&tx->tx_list, &kptllnd_data->kptl_idle_nblk_txs); + } else { + list_add (&tx->tx_list, &kptllnd_data->kptl_idle_txs); + wake_up (&kptllnd_data->kptl_idle_tx_waitq); + } + + STAT_UPDATE(kps_tx_released); + + spin_unlock(&kptllnd_data->kptl_tx_lock); + + PJK_UT_MSG("<<< tx=%p\n",tx); +} + +void +kptllnd_tx_schedule (kptl_tx_t *tx) +{ + kptl_data_t *kptllnd_data = tx->tx_po.po_kptllnd_data; + unsigned long flags; + + PJK_UT_MSG("tx=%p\n",tx); + + spin_lock_irqsave(&kptllnd_data->kptl_sched_lock, flags); + LASSERT(list_empty(&tx->tx_schedlist)); + list_add_tail(&tx->tx_schedlist,&kptllnd_data->kptl_sched_txq); + wake_up(&kptllnd_data->kptl_sched_waitq); + spin_unlock_irqrestore(&kptllnd_data->kptl_sched_lock, flags); +} + +void +kptllnd_tx_callback(ptl_event_t *ev) +{ + kptl_tx_t *tx = ev->md.user_ptr; + kptl_peer_t *peer; + int rc; + int do_decref = 0; + + PJK_UT_MSG(">>> %s(%d) tx=%p fail=%d\n", + get_ev_type_string(ev->type),ev->type,tx,ev->ni_fail_type); + +#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS + PJK_UT_MSG("ev->unlinked=%d\n",ev->unlinked); +#endif + + if(ev->type == PTL_EVENT_UNLINK ){ +#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS + /* + * Ignore unlink events if we don't + * have lustre semantics as these only occur + * in one-to-one correspondence with OPXXX_END + * event's and we've already cleaned up in + * those cases. + */ + PJK_UT_MSG("<<<\n"); + return; +#else + /* + * Clear the handles + */ + if(PtlHandleIsEqual(ev->md_handle,tx->tx_mdh)) + tx->tx_mdh = PTL_INVALID_HANDLE; + else if (PtlHandleIsEqual(ev->md_handle,tx->tx_mdh_msg)) + tx->tx_mdh_msg = PTL_INVALID_HANDLE; + + tx->tx_status = -EINVAL; + kptllnd_tx_scheduled_decref(tx); + PJK_UT_MSG("<<<\n"); + return; +#endif + } + + LASSERT(tx->tx_peer != NULL); + peer = tx->tx_peer; + + spin_lock(&peer->peer_lock); + + /* + * Save the status flag + */ + tx->tx_status = ev->ni_fail_type == PTL_NI_OK ? 0 : -EINVAL; + + switch(ev->type) + { + case PTL_EVENT_SEND_END: + + /* + * Mark that we've seen an SEND END + */ + tx->tx_seen_send_end = 1; + + switch(tx->tx_type) + { + default: + LBUG(); + break; + + case TX_TYPE_SMALL_MESSAGE: + PJK_UT_MSG("TX_TYPE_SMALL_MESSAGE\n"); + LASSERT(PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)); + + /* + * Success or failure we are done with the Message MD + */ + tx->tx_mdh_msg = PTL_INVALID_HANDLE; + do_decref = 1; + break; + + case TX_TYPE_LARGE_PUT: + case TX_TYPE_LARGE_GET: + PJK_UT_MSG("TX_TYPE_LARGE_%s\n", + tx->tx_type == TX_TYPE_LARGE_PUT ? + "PUT" : "GET"); + /* + * Success or failure we are done with the Message MD + */ + tx->tx_mdh_msg = PTL_INVALID_HANDLE; + + /* + * There was an error, and we're not going to make any more + * progress (obviously) and the + * PUT_END or GET_END is never going to come. 
+ */ + if(ev->ni_fail_type != PTL_NI_OK ){ + + /* + * There was a error in the message + * we can safely unlink the MD + * + */ + if(!PtlHandleIsEqual(tx->tx_mdh,PTL_INVALID_HANDLE)){ + LASSERT(atomic_read(&tx->tx_refcount)>1); + rc = PtlMDUnlink(tx->tx_mdh); + LASSERT(rc == 0); +#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS + tx->tx_mdh = PTL_INVALID_HANDLE; + /* + * We are holding another reference + * so this is not going to do anything + * but decrement the tx->ref_count + */ + kptllnd_tx_decref(tx); +#endif + } + } + + do_decref = 1; + break; + + case TX_TYPE_LARGE_PUT_RESPONSE: + PJK_UT_MSG("TX_TYPE_LARGE_PUT_RESPONSE\n"); + LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE)); + + /* + * If'we've already seen the reply end + * or if this is a failure and we're NEVER going + * to see the reply end, release our reference here + */ + if(tx->tx_seen_reply_end || ev->ni_fail_type != PTL_NI_OK){ + tx->tx_mdh = PTL_INVALID_HANDLE; + do_decref = 1; + } + break; + + case TX_TYPE_LARGE_GET_RESPONSE: + PJK_UT_MSG("TX_TYPE_LARGE_GET_RESPONSE\n"); + LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE)); + + /* + * Success or failure we are done with the MD + */ + tx->tx_mdh = PTL_INVALID_HANDLE; + do_decref = 1; + break; + } + break; + + case PTL_EVENT_GET_END: + LASSERT(tx->tx_type == TX_TYPE_LARGE_PUT); + LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE)); + tx->tx_mdh = PTL_INVALID_HANDLE; + do_decref = 1; + break; + case PTL_EVENT_PUT_END: + LASSERT(tx->tx_type == TX_TYPE_LARGE_GET); + LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE)); + tx->tx_mdh = PTL_INVALID_HANDLE; + do_decref = 1; + break; + case PTL_EVENT_REPLY_END: + LASSERT(tx->tx_type == TX_TYPE_LARGE_PUT_RESPONSE); + LASSERT(PtlHandleIsEqual(tx->tx_mdh_msg,PTL_INVALID_HANDLE)); + tx->tx_seen_reply_end = 1; + if(tx->tx_seen_send_end){ + tx->tx_mdh = PTL_INVALID_HANDLE; + do_decref = 1; + } + break; + default: + LBUG(); + } + + spin_unlock(&peer->peer_lock); + + if(do_decref) + kptllnd_tx_scheduled_decref(tx); + PJK_UT_MSG("<<< decref=%d\n",do_decref); +} + +void +kptllnd_tx_addref( + kptl_tx_t *tx) +{ + atomic_inc(&tx->tx_refcount); +} + +void +kptllnd_tx_decref( + kptl_tx_t *tx) +{ + if( !atomic_dec_and_test(&tx->tx_refcount)){ + return; + } + + PJK_UT_MSG("tx=%p LAST REF\n",tx); + kptllnd_tx_done(tx); +} + +void +kptllnd_tx_scheduled_decref( + kptl_tx_t *tx) +{ + if( !atomic_dec_and_test(&tx->tx_refcount)){ + /* + * The below message could actually be out of sync + * with the real ref count, and is for informational purposes + * only + */ + PJK_UT_MSG("tx=%p count=%d\n",tx, + atomic_read(&tx->tx_refcount)); + return; + } + + PJK_UT_MSG("tx=%p LAST REF\n",tx); + kptllnd_tx_schedule(tx); +} diff --git a/lnet/klnds/ptllnd/ptllnd_wire.h b/lnet/klnds/ptllnd/ptllnd_wire.h new file mode 100644 index 0000000..3b0df56 --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_wire.h @@ -0,0 +1,56 @@ +/************************************************************************ + * Portal NAL Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). 
+ */ + +typedef struct +{ + lnet_hdr_t kptlim_hdr; /* portals header */ + char kptlim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kptl_immediate_msg_t; + +typedef struct +{ + lnet_hdr_t kptlrm_hdr; /* portals header */ + __u64 kptlrm_matchbits; /* matchbits */ +} WIRE_ATTR kptl_request_msg_t; + +typedef struct +{ + __u64 kptlhm_matchbits; /* matchbits */ + __u32 kptlhm_max_immd_size; /* immd message size */ +} WIRE_ATTR kptl_hello_msg_t; + +typedef struct kptl_msg +{ + /* First 2 fields fixed FOR ALL TIME */ + __u32 ptlm_magic; /* I'm an ptl NAL message */ + __u16 ptlm_version; /* this is my version number */ + __u8 ptlm_type; /* the message type */ + __u8 ptlm_credits; /* returned credits */ + __u32 ptlm_nob; /* # bytes in whole message */ + __u32 ptlm_cksum; /* checksum (0 == no checksum) */ + __u64 ptlm_srcnid; /* sender's NID */ + __u64 ptlm_srcstamp; /* sender's incarnation */ + __u64 ptlm_dstnid; /* destination's NID */ + __u64 ptlm_dststamp; /* destination's incarnation */ + __u64 ptlm_seq; /* sequence number */ + + union { + kptl_immediate_msg_t immediate; + kptl_request_msg_t req; + kptl_hello_msg_t hello; + } WIRE_ATTR ptlm_u; + +}kptl_msg_t; + +#define PTLLND_MSG_MAGIC 0x50746C4E /* 'PtlN' unique magic */ +#define PTLLND_MSG_VERSION 0x01 + +#define PTLLND_MSG_TYPE_INVALID 0x00 +#define PLTLND_MSG_TYPE_PUT 0x01 +#define PTLLND_MSG_TYPE_GET 0x02 +#define PTLLND_MSG_TYPE_IMMEDIATE 0x03 /* No bulk data xfer*/ +#define PTLLND_MSG_TYPE_NOOP 0x04 +#define PTLLND_MSG_TYPE_HELLO 0x05 + diff --git a/lnet/klnds/ptllnd/wirecheck.c b/lnet/klnds/ptllnd/wirecheck.c new file mode 100644 index 0000000..e40a043 --- /dev/null +++ b/lnet/klnds/ptllnd/wirecheck.c @@ -0,0 +1,183 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#include +#include +#include +#include + +#include + +/* This ghastly hack to allows me to include lib-types.h It doesn't affect any + * assertions generated here (but fails-safe if it ever does) */ +typedef struct { + int counter; +} atomic_t; + +#include + +#include "ptlnal_wire.h" + +#ifndef HAVE_STRNLEN +#define strnlen(s, i) strlen(s) +#endif + +#define BLANK_LINE() \ +do { \ + printf ("\n"); \ +} while (0) + +#define COMMENT(c) \ +do { \ + printf (" /* "c" */\n"); \ +} while (0) + +#undef STRINGIFY +#define STRINGIFY(a) #a + +#define CHECK_DEFINE(a) \ +do { \ + printf (" CLASSERT ("#a" == "STRINGIFY(a)");\n"); \ +} while (0) + +#define CHECK_VALUE(a) \ +do { \ + printf (" CLASSERT ("#a" == %d);\n", a); \ +} while (0) + +#define CHECK_MEMBER_OFFSET(s,m) \ +do { \ + CHECK_VALUE((int)offsetof(s, m)); \ +} while (0) + +#define CHECK_MEMBER_SIZEOF(s,m) \ +do { \ + CHECK_VALUE((int)sizeof(((s *)0)->m)); \ +} while (0) + +#define CHECK_MEMBER(s,m) \ +do { \ + CHECK_MEMBER_OFFSET(s, m); \ + CHECK_MEMBER_SIZEOF(s, m); \ +} while (0) + +#define CHECK_STRUCT(s) \ +do { \ + BLANK_LINE (); \ + COMMENT ("Checks for struct "#s); \ + CHECK_VALUE((int)sizeof(s)); \ +} while (0) + +void +system_string (char *cmdline, char *str, int len) +{ + int fds[2]; + int rc; + pid_t pid; + + rc = pipe (fds); + if (rc != 0) + abort (); + + pid = fork (); + if (pid == 0) { + /* child */ + int fd = fileno(stdout); + + rc = dup2(fds[1], fd); + if (rc != fd) + abort(); + + exit(system(cmdline)); + /* notreached */ + } else if ((int)pid < 0) { + abort(); + } else { + FILE *f = fdopen (fds[0], "r"); + + if (f == NULL) + abort(); + + close(fds[1]); + + if (fgets(str, len, f) == NULL) + abort(); + + if (waitpid(pid, &rc, 0) != pid) + 
abort(); + + if (!WIFEXITED(rc) || + WEXITSTATUS(rc) != 0) + abort(); + + if (strnlen(str, len) == len) + str[len - 1] = 0; + + if (str[strlen(str) - 1] == '\n') + str[strlen(str) - 1] = 0; + + fclose(f); + } +} + +int +main (int argc, char **argv) +{ + char unameinfo[80]; + char gccinfo[80]; + + system_string("uname -a", unameinfo, sizeof(unameinfo)); + system_string("gcc -v 2>&1 | tail -1", gccinfo, sizeof(gccinfo)); + + printf ("void vibnal_assert_wire_constants (void)\n" + "{\n" + " /* Wire protocol assertions generated by 'wirecheck'\n" + " * running on %s\n" + " * with %s */\n" + "\n", unameinfo, gccinfo); + + BLANK_LINE (); + + COMMENT ("Constants..."); + CHECK_DEFINE (PTLLND_MSG_MAGIC); + CHECK_DEFINE (PTLLND_MSG_VERSION); + + CHECK_DEFINE (PTLLND_MSG_TYPE_INVALID); + CHECK_DEFINE (PLTLND_MSG_TYPE_PUT); + CHECK_DEFINE (PTLLND_MSG_TYPE_GET); + CHECK_DEFINE (PTLLND_MSG_TYPE_IMMEDIATE); + CHECK_DEFINE (PTLLND_MSG_TYPE_NOOP); + CHECK_DEFINE (PTLLND_MSG_TYPE_HELLO); + + CHECK_STRUCT (kptl_msg_t); + CHECK_MEMBER (kptl_msg_t, ptlm_magic); + CHECK_MEMBER (kptl_msg_t, ptlm_version); + CHECK_MEMBER (kptl_msg_t, ptlm_type); + CHECK_MEMBER (kptl_msg_t, ptlm_credits); + CHECK_MEMBER (kptl_msg_t, ptlm_nob); + CHECK_MEMBER (kptl_msg_t, ptlm_cksum); + CHECK_MEMBER (kptl_msg_t, ptlm_srcnid); + CHECK_MEMBER (kptl_msg_t, ptlm_srcstamp); + CHECK_MEMBER (kptl_msg_t, ptlm_dstnid); + CHECK_MEMBER (kptl_msg_t, ptlm_dststamp); + CHECK_MEMBER (kptl_msg_t, ptlm_seq); + CHECK_MEMBER (kib_msg_t, ptlm_u.immediate); + CHECK_MEMBER (kib_msg_t, ptlm_u.req); + CHECK_MEMBER (kib_msg_t, ptlm_u.hello); + + CHECK_STRUCT (kptl_immediate_msg_t); + CHECK_MEMBER (kptl_immediate_msg_t, kptlim_hdr); + CHECK_MEMBER (kptl_immediate_msg_t, kptlim_payload[13]); + + CHECK_STRUCT (kptl_request_msg_t); + CHECK_MEMBER (kptl_request_msg_t, kptlrm_hdr); + CHECK_MEMBER (kptl_request_msg_t, kptlrm_matchbits); + + CHECK_STRUCT (kptl_hello_msg_t); + CHECK_MEMBER (kptl_hello_msg_t, kptlhm_matchbits); + CHECK_MEMBER (kptl_hello_msg_t, kptlhm_max_immd_size); + + printf ("}\n\n"); + + return (0); +} diff --git a/lnet/libcfs/nidstrings.c b/lnet/libcfs/nidstrings.c index 1f0c9fd..e12f75c 100644 --- a/lnet/libcfs/nidstrings.c +++ b/lnet/libcfs/nidstrings.c @@ -137,6 +137,11 @@ static struct netstrfns libcfs_netstrfns[] = { .nf_modname = "kgmlnd", .nf_addr2str = libcfs_num_addr2str, .nf_str2addr = libcfs_num_str2addr}, + {.nf_type = PTLLND, + .nf_name = "ptl", + .nf_modname = "kptllnd", + .nf_addr2str = libcfs_num_addr2str, + .nf_str2addr = libcfs_num_str2addr}, /* placeholder for net0 alias. 
It MUST BE THE LAST ENTRY */ {.nf_type = -1}, }; diff --git a/lnet/tests/ut.h b/lnet/tests/ut.h index c6dc4b4..96ccb34 100644 --- a/lnet/tests/ut.h +++ b/lnet/tests/ut.h @@ -25,6 +25,7 @@ const char *get_ev_type_string(int evtype) } static volatile int seen = 0; +static volatile int seen_unlink = 0; static inline void handler(lnet_event_t *ev) { @@ -40,4 +41,5 @@ static inline void handler(lnet_event_t *ev) PJK_UT_MSG("md.user_ptr=%p\n",ev->md.user_ptr); PJK_UT_MSG("-------- EVENT END --------------\n"); ++seen; + if(ev->unlinked)++seen_unlink; } diff --git a/lnet/tests/ut_cli.c b/lnet/tests/ut_cli.c index 9655985..33088c7 100644 --- a/lnet/tests/ut_cli.c +++ b/lnet/tests/ut_cli.c @@ -1,8 +1,8 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: */ - - #define UT_MSG_MODULE_NAME "utcli " + + #define UT_MSG_MODULE_NAME "utcli " #include "ut.h" int pkt_size = 300; @@ -46,7 +46,7 @@ static int __init utcli_init(void) PORTAL_ALLOC (buffer, pkt_size); if (buffer == NULL) { - CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", pkt_size); + CERROR ("Unable to allocate out_buf (%d bytes)\n", pkt_size); return -ENOMEM; } @@ -100,9 +100,9 @@ static int __init utcli_init(void) target.pid = 0; target.nid = libcfs_str2nid(nid); - + PJK_UT_MSG("target.nid="LPX64"\n",target.nid); - + for(i=0;i<1;i++) { if(get){ @@ -141,7 +141,7 @@ static int __init utcli_init(void) while(i++ < 10 && seen == 0) cfs_pause(cfs_time_seconds(1)); if(seen == 0) - PJK_UT_MSG("------------------TIMEDOUT--------------------\n"); + PJK_UT_MSG("------------------TIMEDOUT--------------------\n"); else{ int good; if(get){ @@ -153,12 +153,12 @@ static int __init utcli_init(void) }else{ good = 1; } - + if(good) PJK_UT_MSG("------------------COMPLETE--------------------\n"); else - PJK_UT_MSG("------------------TIMEDOUT--------------------\n"); - } + PJK_UT_MSG("------------------TIMEDOUT--------------------\n"); + } @@ -172,6 +172,15 @@ static int __init utcli_init(void) exit5: PJK_UT_MSG("LNetMDUnlink()\n"); LNetMDUnlink(mdh); + + if(!seen_unlink){ + PJK_UT_MSG("------------Waiting for UNLINK ------------\n"); + i=0; + while(i++ < 120 && seen_unlink == 0) + cfs_pause(cfs_time_seconds(1)); + } + + cfs_pause(cfs_time_seconds(1)); exit4: PJK_UT_MSG("LNetEQFree()\n"); LNetEQFree(eqh); @@ -189,7 +198,7 @@ exit0: static void /*__exit*/ utcli_cleanup(void) { PJK_UT_MSG(">>>\n"); - PJK_UT_MSG("<<<\n"); + PJK_UT_MSG("<<<\n"); } /* utcli_cleanup() */ diff --git a/lnet/tests/ut_srv.c b/lnet/tests/ut_srv.c index 2341681..73a3c7b 100644 --- a/lnet/tests/ut_srv.c +++ b/lnet/tests/ut_srv.c @@ -3,7 +3,7 @@ */ -#define UT_MSG_MODULE_NAME "utsrv " +#define UT_MSG_MODULE_NAME "utsrv " #include "ut.h" @@ -23,32 +23,32 @@ static int __init utsrv_init(void) lnet_process_id_t anypid; lnet_process_id_t mypid; lnet_md_t md; - + PJK_UT_MSG(">>>\n"); - PJK_UT_MSG("pkt_size=%d\n",pkt_size); + PJK_UT_MSG("pkt_size=%d\n",pkt_size); PJK_UT_MSG("auto_unlink=%d\n",auto_unlink); - + PJK_UT_MSG("PORTAL_ALLOC\n"); PORTAL_ALLOC (buffer, pkt_size); if (buffer == NULL) { - CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", pkt_size); + CERROR ("Unable to allocate out_buf (%d bytes)\n", pkt_size); rc = -ENOMEM; goto exit0; - } - + } + PJK_UT_MSG("LNetNiInit()\n"); rc = LNetNIInit(0); if (rc < 0) { CERROR ("LNetNIInit: error %d\n", rc); goto exit1; - } - + } + LNetGetId(0,&mypid); PJK_UT_MSG("my.nid="LPX64"\n",mypid.nid); - PJK_UT_MSG("my.pid=0x%x\n",mypid.pid); - + PJK_UT_MSG("my.pid=0x%x\n",mypid.pid); + 
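+	/*
+	 * Receive-side setup: allocate an event queue, attach a match
+	 * entry that accepts requests from any peer, then attach an MD
+	 * over `buffer'.  With auto_unlink the MD threshold is 1, so it
+	 * should unlink itself after the first operation; otherwise a
+	 * threshold of 100 leaves it posted for repeated tests.
+	 */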
PJK_UT_MSG("LNetEQAlloc\n"); rc = LNetEQAlloc( 64, /* max number of envents why 64? */ @@ -57,8 +57,8 @@ static int __init utsrv_init(void) if(rc != 0) { CERROR("LNetEQAlloc failed %d\n",rc); goto exit2; - } - + } + anypid.nid = LNET_NID_ANY; anypid.pid = LNET_PID_ANY; @@ -75,8 +75,8 @@ static int __init utsrv_init(void) if(rc != 0) { CERROR("LNetMeAttach failed %d\n",rc); goto exit3; - } - + } + md.start = buffer; md.length = pkt_size; md.threshold = auto_unlink ? 1 : 100; @@ -97,26 +97,26 @@ static int __init utsrv_init(void) if(rc != 0){ CERROR("LNetMDAttach failed %d\n",rc); goto exit4; - } - + } + rc = 0; goto exit0; - -exit4: + +exit4: PJK_UT_MSG("LNetMEUnlink()\n"); LNetMEUnlink(meh); -exit3: +exit3: PJK_UT_MSG("LNetEQFree()\n"); LNetEQFree(eqh); exit2: PJK_UT_MSG("LNetNiFini()\n"); LNetNIFini(); exit1: - PORTAL_FREE(buffer,pkt_size); + PORTAL_FREE(buffer,pkt_size); exit0: PJK_UT_MSG("<<< rc=%d\n",rc); return rc; - + } /* utsrv_init() */ @@ -126,12 +126,12 @@ static void /*__exit*/ utsrv_cleanup(void) PJK_UT_MSG("LNetMDUnlink()\n"); LNetMDUnlink(mdh); PJK_UT_MSG("LNetMEUnlink()\n"); - LNetMEUnlink(meh); + LNetMEUnlink(meh); PJK_UT_MSG("LNetEQFree()\n"); - LNetEQFree(eqh); + LNetEQFree(eqh); PJK_UT_MSG("LNetNiFini()\n"); LNetNIFini(); - PORTAL_FREE(buffer,pkt_size); + PORTAL_FREE(buffer,pkt_size); PJK_UT_MSG("<<<\n"); } /* utsrv_cleanup() */