LU-17839 kfilnd: Wait for hello response to mark peer uptodate

author Chris Horn <chris.horn@hpe.com>

Wed, 15 Nov 2023 19:22:24 +0000 (12:22 -0700)

committer Oleg Drokin <green@whamcloud.com>

Wed, 29 May 2024 04:49:41 +0000 (04:49 +0000)
author Chris Horn <chris.horn@hpe.com>
Wed, 15 Nov 2023 19:22:24 +0000 (12:22 -0700)
committer Oleg Drokin <green@whamcloud.com>
Wed, 29 May 2024 04:49:41 +0000 (04:49 +0000)
diff --git a/lnet/klnds/kfilnd/kfilnd.h b/lnet/klnds/kfilnd/kfilnd.h

index 92b1a75..816337a 100644 (file)
--- a/lnet/klnds/kfilnd/kfilnd.h
+++ b/lnet/klnds/kfilnd/kfilnd.h
@@ -105,6 +105,8 @@
  #define CFS_KFI_FAIL_WAIT_SEND_COMP2 0xF116
  #define CFS_KFI_FAIL_WAIT_SEND_COMP3 0xF117
  #define CFS_KFI_REPLAY_IDLE_EVENT 0xF118
+#define CFS_KFI_REPLAY_RX_HELLO_REQ 0xF119
+#define CFS_KFI_FAIL_MSG_TYPE_EAGAIN 0xF11A
  
  /* Maximum number of transaction keys supported. */
  #define KFILND_EP_KEY_BITS 16U
@@ -241,6 +243,12 @@ struct kfilnd_ep {
  #define KP_STATE_STALE 0x3
  /* We suspect this peer is actually down or otherwise unreachable */
  #define KP_STATE_DOWN 0x4
+/* We received a HELLO request from a new peer, and are waiting
+ * for the response to our HELLO request. We can handle RX events for
+ * such a peer, but we will throttle sends to this peer until it is
+ * up-to-date
+ */
+#define KP_STATE_WAIT_RSP 0x5
  
  struct kfilnd_peer {
         struct rhash_head kp_node;
@@ -286,11 +294,12 @@ static inline bool kfilnd_peer_is_new_peer(struct kfilnd_peer *kp)
         return atomic_read(&kp->kp_state) == KP_STATE_NEW;
  }
  
+/* We need to throttle messages if the peer is not up-to-date or stale */
  static inline bool kfilnd_peer_needs_throttle(struct kfilnd_peer *kp)
  {
         unsigned int kp_state = atomic_read(&kp->kp_state);
  
-       return (kp_state == KP_STATE_NEW || kp_state == KP_STATE_DOWN);
+       return !(kp_state == KP_STATE_UPTODATE || kp_state == KP_STATE_STALE);
  }
  
  /* Peer needs hello if it is not up to date and there is not already a hello
diff --git a/lnet/klnds/kfilnd/kfilnd_ep.c b/lnet/klnds/kfilnd/kfilnd_ep.c

index 904295a..18f53fc 100644 (file)
--- a/lnet/klnds/kfilnd/kfilnd_ep.c
+++ b/lnet/klnds/kfilnd/kfilnd_ep.c
@@ -406,7 +406,9 @@ int kfilnd_ep_post_send(struct kfilnd_ep *ep, struct kfilnd_transaction *tn)
                         return 0;
         } else if (CFS_FAIL_CHECK(CFS_KFI_FAIL_SEND)) {
                 return -EIO;
-       } else if (CFS_FAIL_CHECK(CFS_KFI_FAIL_SEND_EAGAIN)) {
+       } else if (CFS_FAIL_CHECK_VALUE(CFS_KFI_FAIL_MSG_TYPE_EAGAIN,
+                                        tn->tn_tx_msg.msg->type) ||
+                  CFS_FAIL_CHECK(CFS_KFI_FAIL_SEND_EAGAIN)) {
                 return -EAGAIN;
         }
  
diff --git a/lnet/klnds/kfilnd/kfilnd_peer.c b/lnet/klnds/kfilnd/kfilnd_peer.c

index 2c8ab95..6c66739 100644 (file)
--- a/lnet/klnds/kfilnd/kfilnd_peer.c
+++ b/lnet/klnds/kfilnd/kfilnd_peer.c
@@ -364,21 +364,17 @@ void kfilnd_peer_process_hello(struct kfilnd_peer *kp, struct kfilnd_msg *msg)
                        libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr,
                        msg->proto.hello.version, KFILND_MSG_VERSION,
                        kp->kp_version);
-       } else {
+               if (atomic_cmpxchg(&kp->kp_state, KP_STATE_NEW,
+                                  KP_STATE_WAIT_RSP) == KP_STATE_NEW)
+                       CDEBUG(D_NET, "Peer %s(%p):0x%llx new -> wait response\n",
+                              libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr);
+       } else if (msg->type == KFILND_MSG_HELLO_RSP) {
                 kp->kp_version = msg->proto.hello.version;
-               CDEBUG(D_NET,"Peer %s(%p):0x%llx negotiated version: %u\n",
+               atomic_set(&kp->kp_state, KP_STATE_UPTODATE);
+               CDEBUG(D_NET,
+                      "Peer %s(%p):0x%llx is up-to-date negotiated version: %u\n",
                        libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr,
                        msg->proto.hello.version);
-       }
-
-       atomic_set(&kp->kp_state, KP_STATE_UPTODATE);
-       CDEBUG(D_NET, "kp %s(%p):0x%llx is up-to-date\n",
-              libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr);
-
-       /* Clear kp_hello_state if we've received the hello response,
-        * otherwise this is an incoming hello request and we may have our
-        * own hello request to this peer still outstanding
-        */
-       if (msg->type == KFILND_MSG_HELLO_RSP)
                 kfilnd_peer_clear_hello_state(kp);
+       }
  }
diff --git a/lnet/klnds/kfilnd/kfilnd_tn.c b/lnet/klnds/kfilnd/kfilnd_tn.c

index eba1077..2ab71e8 100644 (file)
--- a/lnet/klnds/kfilnd/kfilnd_tn.c
+++ b/lnet/klnds/kfilnd/kfilnd_tn.c
@@ -846,6 +846,22 @@ static int kfilnd_tn_state_idle(struct kfilnd_transaction *tn,
  
                 switch (msg->type) {
                 case KFILND_MSG_HELLO_REQ:
+                       if (CFS_FAIL_CHECK(CFS_KFI_REPLAY_RX_HELLO_REQ)) {
+                               CDEBUG(D_NET, "Replay RX HELLO_REQ\n");
+                               return -EAGAIN;
+                       }
+
+                       if (kfilnd_peer_is_new_peer(tn->tn_kp)) {
+                               rc = kfilnd_send_hello_request(tn->tn_ep->end_dev,
+                                                              tn->tn_ep->end_cpt,
+                                                              tn->tn_kp);
+                               if (rc)
+                                       KFILND_TN_ERROR(tn,
+                                                       "Failed to send hello request: rc=%d",
+                                                       rc);
+                               rc = 0;
+                       }
+
                         kfilnd_peer_process_hello(tn->tn_kp, msg);
                         tn->tn_target_addr = kfilnd_peer_get_kfi_addr(tn->tn_kp);
                         KFILND_TN_DEBUG(tn, "Using peer %s(%#llx)",
author	Chris Horn <chris.horn@hpe.com>
	Wed, 15 Nov 2023 19:22:24 +0000 (12:22 -0700)
committer	Oleg Drokin <green@whamcloud.com>
	Wed, 29 May 2024 04:49:41 +0000 (04:49 +0000)
lnet/klnds/kfilnd/kfilnd.h		patch \| blob \| history
lnet/klnds/kfilnd/kfilnd_ep.c		patch \| blob \| history
lnet/klnds/kfilnd/kfilnd_peer.c		patch \| blob \| history
lnet/klnds/kfilnd/kfilnd_tn.c		patch \| blob \| history