Whamcloud - gitweb
LU-16989 kfilnd: Handle TX_FAIL in WAIT_SEND_COMP 81/51781/3
author Chris Horn <chris.horn@hpe.com>
Mon, 12 Dec 2022 23:28:54 +0000 (16:28 -0700)
committer Oleg Drokin <green@whamcloud.com>
Tue, 22 Aug 2023 06:40:53 +0000 (06:40 +0000)
It is possible for us to get a TN_EVENT_TX_FAIL while a transaction is
in the TN_STATE_WAIT_SEND_COMP state. We should handle this situation
gracefully rather than hitting an LBUG.

Test-Parameters: trivial
HPE-bug-id: LUS-11344
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ib6fc5ed41f12762843fe9f638ffd523699936556
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51781
Tested-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/kfilnd/kfilnd.h
lnet/klnds/kfilnd/kfilnd_ep.c
lnet/klnds/kfilnd/kfilnd_ep.h
lnet/klnds/kfilnd/kfilnd_tn.c
lustre/tests/sanity-lnet.sh

index 9165081..3e26405 100644 (file)
 #define CFS_KFI_FAIL_RECV 0xF112
 #define CFS_KFI_FAIL_MSG_UNPACK 0xF113
 #define CFS_KFI_FAIL_MSG_TYPE 0xF114
+#define CFS_KFI_FAIL_WAIT_SEND_COMP1 0xF115
+#define CFS_KFI_FAIL_WAIT_SEND_COMP2 0xF116
 
 /* Maximum number of transaction keys supported. */
 #define KFILND_EP_KEY_BITS 16U
index ff743ee..d2c677c 100644 (file)
@@ -180,8 +180,8 @@ static void kfilnd_ep_err_fail_loc_work(struct work_struct *work)
        kfree(err);
 }
 
-static int kfilnd_ep_gen_fake_err(struct kfilnd_ep *ep,
-                                 const struct kfi_cq_err_entry *err)
+int kfilnd_ep_gen_fake_err(struct kfilnd_ep *ep,
+                          const struct kfi_cq_err_entry *err)
 {
        struct kfilnd_ep_err_fail_loc_work *fake_err;
 
index 6b83fdf..c0f0e9c 100644 (file)
@@ -65,6 +65,8 @@ void kfilnd_ep_queue_tn_replay(struct kfilnd_ep *ep,
 
 int kfilnd_ep_get_key(struct kfilnd_ep *ep);
 void kfilnd_ep_put_key(struct kfilnd_ep *ep, unsigned int key);
+int kfilnd_ep_gen_fake_err(struct kfilnd_ep *ep,
+                          const struct kfi_cq_err_entry *err);
 
 
 #endif /* _KFILND_EP_ */
index 04b457f..d5b0672 100644 (file)
@@ -1057,6 +1057,11 @@ static int kfilnd_tn_state_wait_comp(struct kfilnd_transaction *tn,
 
        switch (event) {
        case TN_EVENT_TX_OK:
+               if (unlikely(tn->msg_type == KFILND_MSG_BULK_PUT_REQ) &&
+                   CFS_FAIL_CHECK_RESET(CFS_KFI_FAIL_WAIT_SEND_COMP1,
+                                        CFS_KFI_FAIL_WAIT_SEND_COMP2 |
+                                        CFS_FAIL_ONCE))
+                       break;
                kfilnd_peer_alive(tn->tn_kp);
                kfilnd_tn_timeout_enable(tn);
                kfilnd_tn_state_change(tn, TN_STATE_WAIT_TAG_COMP);
@@ -1064,6 +1069,16 @@ static int kfilnd_tn_state_wait_comp(struct kfilnd_transaction *tn,
 
        case TN_EVENT_TAG_RX_OK:
                kfilnd_tn_state_change(tn, TN_STATE_WAIT_SEND_COMP);
+               if (unlikely(tn->msg_type == KFILND_MSG_BULK_PUT_REQ) &&
+                   CFS_FAIL_CHECK(CFS_KFI_FAIL_WAIT_SEND_COMP2)) {
+                       struct kfi_cq_err_entry fake_error = {
+                               .op_context = tn,
+                               .flags = KFI_MSG | KFI_SEND,
+                               .err = EIO,
+                       };
+
+                       kfilnd_ep_gen_fake_err(tn->tn_ep, &fake_error);
+               }
                break;
 
        case TN_EVENT_TX_FAIL:
@@ -1125,14 +1140,22 @@ static int kfilnd_tn_state_wait_send_comp(struct kfilnd_transaction *tn,
        KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event),
                        status);
 
-       if (event == TN_EVENT_TX_OK) {
+       switch (event) {
+       case TN_EVENT_TX_OK:
                kfilnd_peer_alive(tn->tn_kp);
-               kfilnd_tn_finalize(tn, tn_released);
-       } else {
+               break;
+       case TN_EVENT_TX_FAIL:
+               kfilnd_tn_status_update(tn, status,
+                                       LNET_MSG_STATUS_NETWORK_TIMEOUT);
+               kfilnd_peer_tn_failed(tn->tn_kp, status);
+               break;
+       default:
                KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event));
                LBUG();
        }
 
+       kfilnd_tn_finalize(tn, tn_released);
+
        return 0;
 }
 
index 37db599..b634154 100755 (executable)
@@ -3623,6 +3623,26 @@ test_310() {
 }
 run_test 310 "Set timeout and verify"
 
+test_311() {
+       [[ $NETTYPE == kfi* ]] ||
+               skip "Need kfi network type"
+
+       setupall || error "setupall failed"
+
+       mkdir -p $DIR/$tdir || error "mkdir failed"
+       dd if=/dev/zero of=$DIR/$tdir/$tfile bs=1M count=1 oflag=direct ||
+               error "dd write failed"
+
+       local list=$(comma_list $(osts_nodes))
+
+#define CFS_KFI_FAIL_WAIT_SEND_COMP1 0xF115
+       do_nodes $list $LCTL set_param fail_loc=0x8000F115
+       dd if=$DIR/$tdir/$tfile of=/dev/null bs=1M count=1 ||
+               error "dd read failed"
+
+       cleanupall || error "Failed cleanup"
+}
+run_test 311 "Fail bulk put in send wait completion"
 
 check_udsp_prio() {
        local target_net="${1}"