Whamcloud - gitweb
* bug 11659 fix - finer-grained peerstamps for ptllnd connection
authoreeb <eeb>
Thu, 22 Mar 2007 17:52:42 +0000 (17:52 +0000)
committereeb <eeb>
Thu, 22 Mar 2007 17:52:42 +0000 (17:52 +0000)
    establishment and better credit flow checks.

lnet/ChangeLog
lnet/klnds/ptllnd/ptllnd.c
lnet/klnds/ptllnd/ptllnd.h
lnet/klnds/ptllnd/ptllnd_peer.c
lnet/klnds/ptllnd/ptllnd_rx_buf.c

index a5aeb91..0bbc28c 100644 (file)
@@ -14,6 +14,15 @@ TBD         Cluster File Systems, Inc. <info@clusterfs.com>
        * bug fixes
 
 Severity   : major
+Frequency  : after Ptllnd timeouts and portals congestion
+Bugzilla   : 11659
+Description: Credit overflows
+Details    : This was a bug in ptllnd connection establishment.  The fix
+             implements better peer stamps to disambiguate connection
+            establishment and ensure both peers enter the credit flow
+            state machine consistently.
+
+Severity   : major
 Frequency  : rare      
 Bugzilla   : 11394
 Description: kptllnd didn't propagate some network errors up to LNET 
index a82babe..5723c8a 100755 (executable)
@@ -198,7 +198,7 @@ kptllnd_msg_pack(kptl_msg_t *msg, kptl_peer_t *peer)
         /* msg->ptlm_nob   Filled in kptllnd_init_msg()  */
         msg->ptlm_cksum    = 0;
         msg->ptlm_srcnid   = kptllnd_data.kptl_ni->ni_nid;
-        msg->ptlm_srcstamp = kptllnd_data.kptl_incarnation;
+        msg->ptlm_srcstamp = peer->peer_myincarnation;
         msg->ptlm_dstnid   = peer->peer_id.nid;
         msg->ptlm_dststamp = peer->peer_incarnation;
         msg->ptlm_srcpid   = the_lnet.ln_pid;
@@ -527,9 +527,9 @@ kptllnd_startup (lnet_ni_t *ni)
                kptllnd_ptlid2str(kptllnd_data.kptl_portals_id),
                libcfs_nid2str(ni->ni_nid));
 
-        /*
-         * Initialized the incarnation
-         */
+        /* Initialized the incarnation - it must be for-all-time unique, even
+         * accounting for the fact that we increment it when we disconnect a
+         * peer that's using it */
         do_gettimeofday(&tv);
         kptllnd_data.kptl_incarnation = (((__u64)tv.tv_sec) * 1000000) +
                                         tv.tv_usec;
index 4072a41..598c4b8 100755 (executable)
@@ -217,9 +217,11 @@ struct kptl_peer
         lnet_process_id_t       peer_id;                /* Peer's LNET id */
         ptl_process_id_t        peer_ptlid;             /* Peer's portals id */
         __u64                   peer_incarnation;       /* peer's incarnation */
+        __u64                   peer_myincarnation;     /* my incarnation at HELLO */
         int                     peer_sent_hello;        /* have I sent HELLO? */
         int                     peer_credits;           /* number of send credits */
-        int                     peer_outstanding_credits;/* number of peer credits */
+        int                     peer_outstanding_credits;/* number of peer credits to return */
+        int                     peer_active_rxs;        /* # rx-es being handled */
         int                     peer_error;             /* errno on closing this peer */
         cfs_time_t              peer_last_alive;        /* when (in jiffies) I was last alive */
         __u64                   peer_next_matchbits;    /* Next value to register RDMA from peer */
index f8999e0..0f9e7e0 100644 (file)
@@ -169,11 +169,14 @@ kptllnd_peer_allocate (lnet_process_id_t lpid, ptl_process_id_t ppid)
         peer->peer_credits = 1;                 /* enough for HELLO */
         peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
         peer->peer_outstanding_credits = *kptllnd_tunables.kptl_peercredits - 1;
+        peer->peer_active_rxs = 0;
 
         atomic_set(&peer->peer_refcount, 1);    /* 1 ref for caller */
 
         write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
 
+        peer->peer_myincarnation = kptllnd_data.kptl_incarnation;
+
         /* Only increase # peers under lock, to guarantee we dont grow it
          * during shutdown */
         if (kptllnd_data.kptl_shutdown) {
@@ -198,6 +201,7 @@ kptllnd_peer_destroy (kptl_peer_t *peer)
 
         LASSERT (!in_interrupt());
         LASSERT (atomic_read(&peer->peer_refcount) == 0);
+        LASSERT (peer->peer_active_rxs == 0);
         LASSERT (peer->peer_state == PEER_STATE_ALLOCATED ||
                  peer->peer_state == PEER_STATE_ZOMBIE);
         LASSERT (list_empty(&peer->peer_sendq));
@@ -350,6 +354,11 @@ kptllnd_peer_close_locked(kptl_peer_t *peer, int why)
 
         case PEER_STATE_WAITING_HELLO:
         case PEER_STATE_ACTIVE:
+                /* Ensure new peers see a new incarnation of me */
+                LASSERT(peer->peer_myincarnation <= kptllnd_data.kptl_incarnation);
+                if (peer->peer_myincarnation == kptllnd_data.kptl_incarnation)
+                        kptllnd_data.kptl_incarnation++;
+
                 /* Removing from peer table */
                 kptllnd_data.kptl_n_active_peers--;
                 LASSERT (kptllnd_data.kptl_n_active_peers >= 0);
@@ -946,15 +955,42 @@ kptllnd_peer_handle_hello (ptl_process_id_t  initiator,
                         /* Completing HELLO handshake */
                         LASSERT(peer->peer_incarnation == 0);
 
+                        if (msg->ptlm_dststamp != 0 &&
+                            msg->ptlm_dststamp != peer->peer_myincarnation) {
+                                write_unlock_irqrestore(g_lock, flags);
+
+                                CERROR("Ignoring HELLO from %s: unexpected "
+                                       "dststamp "LPX64" ("LPX64" wanted)\n",
+                                       libcfs_id2str(lpid),
+                                       msg->ptlm_dststamp,
+                                       peer->peer_myincarnation);
+                                kptllnd_peer_decref(peer);
+                                return NULL;
+                        }
+                        
+                        /* Concurrent initiation or response to my HELLO */
                         peer->peer_state = PEER_STATE_ACTIVE;
                         peer->peer_incarnation = msg->ptlm_srcstamp;
                         peer->peer_next_matchbits = safe_matchbits;
-
+                        
                         write_unlock_irqrestore(g_lock, flags);
                         return peer;
                 }
 
-                /* remove old incarnation of this peer */
+                if (msg->ptlm_dststamp != 0 &&
+                    msg->ptlm_dststamp <= peer->peer_myincarnation) {
+                        write_unlock_irqrestore(g_lock, flags);
+
+                        CERROR("Ignoring stale HELLO from %s: "
+                               "dststamp "LPX64" (current "LPX64")\n",
+                               libcfs_id2str(lpid),
+                               msg->ptlm_dststamp,
+                               peer->peer_myincarnation);
+                        kptllnd_peer_decref(peer);
+                        return NULL;
+                }
+
+                /* Brand new connection attempt: remove old incarnation */
                 kptllnd_peer_close_locked(peer, 0);
         }
 
index b83ab51..ad0f05d 100644 (file)
@@ -344,6 +344,9 @@ kptllnd_rx_done(kptl_rx_t *rx)
                 /* Update credits (after I've decref-ed the buffer) */
                 spin_lock_irqsave(&peer->peer_lock, flags);
 
+                peer->peer_active_rxs--;
+                LASSERT (peer->peer_active_rxs >= 0);
+
                 peer->peer_outstanding_credits++;
                 LASSERT (peer->peer_outstanding_credits <=
                          *kptllnd_tunables.kptl_peercredits);
@@ -581,11 +584,8 @@ kptllnd_rx_parse(kptl_rx_t *rx)
 
         if (msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) {
                 peer = kptllnd_peer_handle_hello(rx->rx_initiator, msg);
-                if (peer == NULL) {
-                        CWARN("No peer for %s\n",
-                              kptllnd_ptlid2str(rx->rx_initiator));
+                if (peer == NULL)
                         goto rx_done;
-                }
         } else {
                 peer = kptllnd_id2peer(srcid);
                 if (peer == NULL) {
@@ -596,61 +596,74 @@ kptllnd_rx_parse(kptl_rx_t *rx)
                         goto rx_done;
                 }
 
-                /* Ignore anything else while I'm waiting for HELLO */
-                if (peer->peer_state == PEER_STATE_WAITING_HELLO) {
+                /* Ignore anything apart from HELLO while I'm waiting for it and
+                 * any messages for a previous incarnation of the connection */
+                if (peer->peer_state == PEER_STATE_WAITING_HELLO ||
+                    msg->ptlm_dststamp < peer->peer_myincarnation) {
                         kptllnd_peer_decref(peer);
                         goto rx_done;
                 }
+
+                if (msg->ptlm_srcstamp != peer->peer_incarnation) {
+                        CERROR("%s: Unexpected srcstamp "LPX64" "
+                               "("LPX64" expected)\n",
+                               libcfs_id2str(peer->peer_id),
+                               msg->ptlm_srcstamp,
+                               peer->peer_incarnation);
+                        rc = -EPROTO;
+                        goto failed;
+                }
+
+                if (msg->ptlm_dststamp != peer->peer_myincarnation) {
+                        CERROR("%s: Unexpected dststamp "LPX64" "
+                               "("LPX64" expected)\n",
+                               libcfs_id2str(peer->peer_id), msg->ptlm_dststamp,
+                               peer->peer_myincarnation);
+                        rc = -EPROTO;
+                        goto failed;
+                }
         }
 
         LASSERT (msg->ptlm_srcnid == peer->peer_id.nid &&
                  msg->ptlm_srcpid == peer->peer_id.pid);
 
-        if (msg->ptlm_srcstamp != peer->peer_incarnation) {
-                CERROR("Stale rx from %s srcstamp "LPX64" expected "LPX64"\n",
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        if (peer->peer_active_rxs == *kptllnd_tunables.kptl_peercredits) {
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+                        
+                CERROR("Message overflow from %s: handling %d already\n",
                        libcfs_id2str(peer->peer_id),
-                       msg->ptlm_srcstamp,
-                       peer->peer_incarnation);
+                       *kptllnd_tunables.kptl_peercredits);
                 rc = -EPROTO;
                 goto failed;
         }
+        
+        if (msg->ptlm_credits != 0 &&
+            peer->peer_credits + msg->ptlm_credits >
+            *kptllnd_tunables.kptl_peercredits) {
+                credits = peer->peer_credits;
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
 
-        if (msg->ptlm_dststamp != kptllnd_data.kptl_incarnation &&
-            (msg->ptlm_type != PTLLND_MSG_TYPE_HELLO || /* HELLO sends a */
-             msg->ptlm_dststamp != 0)) {                /* zero dststamp */
-                CERROR("Stale rx from %s dststamp "LPX64" expected "LPX64"\n",
-                       libcfs_id2str(peer->peer_id), msg->ptlm_dststamp,
-                       kptllnd_data.kptl_incarnation);
+                CERROR("Credit overflow from %s: %d + %d > %d\n",
+                       libcfs_id2str(peer->peer_id),
+                       credits, msg->ptlm_credits,
+                       *kptllnd_tunables.kptl_peercredits);
                 rc = -EPROTO;
                 goto failed;
         }
 
-        if (msg->ptlm_credits != 0) {
-                spin_lock_irqsave(&peer->peer_lock, flags);
-
-                if (peer->peer_credits + msg->ptlm_credits >
-                    *kptllnd_tunables.kptl_peercredits) {
-                        credits = peer->peer_credits;
-                        spin_unlock_irqrestore(&peer->peer_lock, flags);
-                        
-                        CERROR("Credit overflow from %s: %d + %d > %d\n",
-                               libcfs_id2str(peer->peer_id),
-                               credits, msg->ptlm_credits,
-                               *kptllnd_tunables.kptl_peercredits);
-                        rc = -EPROTO;
-                        goto failed;
-                }
-                               
-                peer->peer_credits += msg->ptlm_credits;
+        /* ptllnd-level protocol correct: account credits */
+        peer->peer_credits += msg->ptlm_credits;
+        peer->peer_active_rxs++;
 
-                spin_unlock_irqrestore(&peer->peer_lock, flags);
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
 
+        /* See if something can go out now that credits have come in */
+        if (msg->ptlm_credits != 0)
                 kptllnd_peer_check_sends(peer);
-        }
 
-        /* ptllnd-level protocol correct - rx takes my ref on peer and increments
-         * peer_outstanding_credits when it completes */
-        rx->rx_peer = peer;
+        rx->rx_peer = peer;                /* do buffer accounting on rxdone */
         kptllnd_peer_alive(peer);
 
         switch (msg->ptlm_type) {