From ebe2426e8ee140c9ee3fef34eef99fb8fd8649ea Mon Sep 17 00:00:00 2001 From: eeb Date: Thu, 22 Mar 2007 17:52:42 +0000 Subject: [PATCH] * bug 11659 fix - finer-grained peerstamps for ptllnd connection establishment and better credit flow checks. --- lnet/ChangeLog | 9 ++++ lnet/klnds/ptllnd/ptllnd.c | 8 ++-- lnet/klnds/ptllnd/ptllnd.h | 4 +- lnet/klnds/ptllnd/ptllnd_peer.c | 40 +++++++++++++++++- lnet/klnds/ptllnd/ptllnd_rx_buf.c | 89 ++++++++++++++++++++++----------------- 5 files changed, 105 insertions(+), 45 deletions(-) diff --git a/lnet/ChangeLog b/lnet/ChangeLog index a5aeb91..0bbc28c 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -14,6 +14,15 @@ TBD Cluster File Systems, Inc. * bug fixes Severity : major +Frequency : after Ptllnd timeouts and portals congestion +Bugzilla : 11659 +Description: Credit overflows +Details : This was a bug in ptllnd connection establishment. The fix + implements better peer stamps to disambiguate connection + establishment and ensure both peers enter the credit flow + state machine consistently. + +Severity : major Frequency : rare Bugzilla : 11394 Description: kptllnd didn't propagate some network errors up to LNET diff --git a/lnet/klnds/ptllnd/ptllnd.c b/lnet/klnds/ptllnd/ptllnd.c index a82babe..5723c8a 100755 --- a/lnet/klnds/ptllnd/ptllnd.c +++ b/lnet/klnds/ptllnd/ptllnd.c @@ -198,7 +198,7 @@ kptllnd_msg_pack(kptl_msg_t *msg, kptl_peer_t *peer) /* msg->ptlm_nob Filled in kptllnd_init_msg() */ msg->ptlm_cksum = 0; msg->ptlm_srcnid = kptllnd_data.kptl_ni->ni_nid; - msg->ptlm_srcstamp = kptllnd_data.kptl_incarnation; + msg->ptlm_srcstamp = peer->peer_myincarnation; msg->ptlm_dstnid = peer->peer_id.nid; msg->ptlm_dststamp = peer->peer_incarnation; msg->ptlm_srcpid = the_lnet.ln_pid; @@ -527,9 +527,9 @@ kptllnd_startup (lnet_ni_t *ni) kptllnd_ptlid2str(kptllnd_data.kptl_portals_id), libcfs_nid2str(ni->ni_nid)); - /* - * Initialized the incarnation - */ + /* Initialized the incarnation - it must be for-all-time unique, even + * accounting for the fact that we increment it when we disconnect a + * peer that's using it */ do_gettimeofday(&tv); kptllnd_data.kptl_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; diff --git a/lnet/klnds/ptllnd/ptllnd.h b/lnet/klnds/ptllnd/ptllnd.h index 4072a41..598c4b8 100755 --- a/lnet/klnds/ptllnd/ptllnd.h +++ b/lnet/klnds/ptllnd/ptllnd.h @@ -217,9 +217,11 @@ struct kptl_peer lnet_process_id_t peer_id; /* Peer's LNET id */ ptl_process_id_t peer_ptlid; /* Peer's portals id */ __u64 peer_incarnation; /* peer's incarnation */ + __u64 peer_myincarnation; /* my incarnation at HELLO */ int peer_sent_hello; /* have I sent HELLO? */ int peer_credits; /* number of send credits */ - int peer_outstanding_credits;/* number of peer credits */ + int peer_outstanding_credits;/* number of peer credits to return */ + int peer_active_rxs; /* # rx-es being handled */ int peer_error; /* errno on closing this peer */ cfs_time_t peer_last_alive; /* when (in jiffies) I was last alive */ __u64 peer_next_matchbits; /* Next value to register RDMA from peer */ diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c index f8999e0..0f9e7e0 100644 --- a/lnet/klnds/ptllnd/ptllnd_peer.c +++ b/lnet/klnds/ptllnd/ptllnd_peer.c @@ -169,11 +169,14 @@ kptllnd_peer_allocate (lnet_process_id_t lpid, ptl_process_id_t ppid) peer->peer_credits = 1; /* enough for HELLO */ peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS; peer->peer_outstanding_credits = *kptllnd_tunables.kptl_peercredits - 1; + peer->peer_active_rxs = 0; atomic_set(&peer->peer_refcount, 1); /* 1 ref for caller */ write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags); + peer->peer_myincarnation = kptllnd_data.kptl_incarnation; + /* Only increase # peers under lock, to guarantee we dont grow it * during shutdown */ if (kptllnd_data.kptl_shutdown) { @@ -198,6 +201,7 @@ kptllnd_peer_destroy (kptl_peer_t *peer) LASSERT (!in_interrupt()); LASSERT (atomic_read(&peer->peer_refcount) == 0); + LASSERT (peer->peer_active_rxs == 0); LASSERT (peer->peer_state == PEER_STATE_ALLOCATED || peer->peer_state == PEER_STATE_ZOMBIE); LASSERT (list_empty(&peer->peer_sendq)); @@ -350,6 +354,11 @@ kptllnd_peer_close_locked(kptl_peer_t *peer, int why) case PEER_STATE_WAITING_HELLO: case PEER_STATE_ACTIVE: + /* Ensure new peers see a new incarnation of me */ + LASSERT(peer->peer_myincarnation <= kptllnd_data.kptl_incarnation); + if (peer->peer_myincarnation == kptllnd_data.kptl_incarnation) + kptllnd_data.kptl_incarnation++; + /* Removing from peer table */ kptllnd_data.kptl_n_active_peers--; LASSERT (kptllnd_data.kptl_n_active_peers >= 0); @@ -946,15 +955,42 @@ kptllnd_peer_handle_hello (ptl_process_id_t initiator, /* Completing HELLO handshake */ LASSERT(peer->peer_incarnation == 0); + if (msg->ptlm_dststamp != 0 && + msg->ptlm_dststamp != peer->peer_myincarnation) { + write_unlock_irqrestore(g_lock, flags); + + CERROR("Ignoring HELLO from %s: unexpected " + "dststamp "LPX64" ("LPX64" wanted)\n", + libcfs_id2str(lpid), + msg->ptlm_dststamp, + peer->peer_myincarnation); + kptllnd_peer_decref(peer); + return NULL; + } + + /* Concurrent initiation or response to my HELLO */ peer->peer_state = PEER_STATE_ACTIVE; peer->peer_incarnation = msg->ptlm_srcstamp; peer->peer_next_matchbits = safe_matchbits; - + write_unlock_irqrestore(g_lock, flags); return peer; } - /* remove old incarnation of this peer */ + if (msg->ptlm_dststamp != 0 && + msg->ptlm_dststamp <= peer->peer_myincarnation) { + write_unlock_irqrestore(g_lock, flags); + + CERROR("Ignoring stale HELLO from %s: " + "dststamp "LPX64" (current "LPX64")\n", + libcfs_id2str(lpid), + msg->ptlm_dststamp, + peer->peer_myincarnation); + kptllnd_peer_decref(peer); + return NULL; + } + + /* Brand new connection attempt: remove old incarnation */ kptllnd_peer_close_locked(peer, 0); } diff --git a/lnet/klnds/ptllnd/ptllnd_rx_buf.c b/lnet/klnds/ptllnd/ptllnd_rx_buf.c index b83ab51..ad0f05d 100644 --- a/lnet/klnds/ptllnd/ptllnd_rx_buf.c +++ b/lnet/klnds/ptllnd/ptllnd_rx_buf.c @@ -344,6 +344,9 @@ kptllnd_rx_done(kptl_rx_t *rx) /* Update credits (after I've decref-ed the buffer) */ spin_lock_irqsave(&peer->peer_lock, flags); + peer->peer_active_rxs--; + LASSERT (peer->peer_active_rxs >= 0); + peer->peer_outstanding_credits++; LASSERT (peer->peer_outstanding_credits <= *kptllnd_tunables.kptl_peercredits); @@ -581,11 +584,8 @@ kptllnd_rx_parse(kptl_rx_t *rx) if (msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) { peer = kptllnd_peer_handle_hello(rx->rx_initiator, msg); - if (peer == NULL) { - CWARN("No peer for %s\n", - kptllnd_ptlid2str(rx->rx_initiator)); + if (peer == NULL) goto rx_done; - } } else { peer = kptllnd_id2peer(srcid); if (peer == NULL) { @@ -596,61 +596,74 @@ kptllnd_rx_parse(kptl_rx_t *rx) goto rx_done; } - /* Ignore anything else while I'm waiting for HELLO */ - if (peer->peer_state == PEER_STATE_WAITING_HELLO) { + /* Ignore anything apart from HELLO while I'm waiting for it and + * any messages for a previous incarnation of the connection */ + if (peer->peer_state == PEER_STATE_WAITING_HELLO || + msg->ptlm_dststamp < peer->peer_myincarnation) { kptllnd_peer_decref(peer); goto rx_done; } + + if (msg->ptlm_srcstamp != peer->peer_incarnation) { + CERROR("%s: Unexpected srcstamp "LPX64" " + "("LPX64" expected)\n", + libcfs_id2str(peer->peer_id), + msg->ptlm_srcstamp, + peer->peer_incarnation); + rc = -EPROTO; + goto failed; + } + + if (msg->ptlm_dststamp != peer->peer_myincarnation) { + CERROR("%s: Unexpected dststamp "LPX64" " + "("LPX64" expected)\n", + libcfs_id2str(peer->peer_id), msg->ptlm_dststamp, + peer->peer_myincarnation); + rc = -EPROTO; + goto failed; + } } LASSERT (msg->ptlm_srcnid == peer->peer_id.nid && msg->ptlm_srcpid == peer->peer_id.pid); - if (msg->ptlm_srcstamp != peer->peer_incarnation) { - CERROR("Stale rx from %s srcstamp "LPX64" expected "LPX64"\n", + spin_lock_irqsave(&peer->peer_lock, flags); + + if (peer->peer_active_rxs == *kptllnd_tunables.kptl_peercredits) { + spin_unlock_irqrestore(&peer->peer_lock, flags); + + CERROR("Message overflow from %s: handling %d already\n", libcfs_id2str(peer->peer_id), - msg->ptlm_srcstamp, - peer->peer_incarnation); + *kptllnd_tunables.kptl_peercredits); rc = -EPROTO; goto failed; } + + if (msg->ptlm_credits != 0 && + peer->peer_credits + msg->ptlm_credits > + *kptllnd_tunables.kptl_peercredits) { + credits = peer->peer_credits; + spin_unlock_irqrestore(&peer->peer_lock, flags); - if (msg->ptlm_dststamp != kptllnd_data.kptl_incarnation && - (msg->ptlm_type != PTLLND_MSG_TYPE_HELLO || /* HELLO sends a */ - msg->ptlm_dststamp != 0)) { /* zero dststamp */ - CERROR("Stale rx from %s dststamp "LPX64" expected "LPX64"\n", - libcfs_id2str(peer->peer_id), msg->ptlm_dststamp, - kptllnd_data.kptl_incarnation); + CERROR("Credit overflow from %s: %d + %d > %d\n", + libcfs_id2str(peer->peer_id), + credits, msg->ptlm_credits, + *kptllnd_tunables.kptl_peercredits); rc = -EPROTO; goto failed; } - if (msg->ptlm_credits != 0) { - spin_lock_irqsave(&peer->peer_lock, flags); - - if (peer->peer_credits + msg->ptlm_credits > - *kptllnd_tunables.kptl_peercredits) { - credits = peer->peer_credits; - spin_unlock_irqrestore(&peer->peer_lock, flags); - - CERROR("Credit overflow from %s: %d + %d > %d\n", - libcfs_id2str(peer->peer_id), - credits, msg->ptlm_credits, - *kptllnd_tunables.kptl_peercredits); - rc = -EPROTO; - goto failed; - } - - peer->peer_credits += msg->ptlm_credits; + /* ptllnd-level protocol correct: account credits */ + peer->peer_credits += msg->ptlm_credits; + peer->peer_active_rxs++; - spin_unlock_irqrestore(&peer->peer_lock, flags); + spin_unlock_irqrestore(&peer->peer_lock, flags); + /* See if something can go out now that credits have come in */ + if (msg->ptlm_credits != 0) kptllnd_peer_check_sends(peer); - } - /* ptllnd-level protocol correct - rx takes my ref on peer and increments - * peer_outstanding_credits when it completes */ - rx->rx_peer = peer; + rx->rx_peer = peer; /* do buffer accounting on rxdone */ kptllnd_peer_alive(peer); switch (msg->ptlm_type) { -- 1.8.3.1