From: Chris Horn Date: Mon, 12 Dec 2022 23:28:54 +0000 (-0700) Subject: LU-16989 kfilnd: Handle TX_FAIL in WAIT_SEND_COMP X-Git-Tag: 2.15.58~90 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=refs%2Fchanges%2F81%2F51781%2F3;p=fs%2Flustre-release.git LU-16989 kfilnd: Handle TX_FAIL in WAIT_SEND_COMP It is possible for us to get a TN_EVENT_TX_FAIL while a transaction is in the TN_STATE_WAIT_SEND_COMP state. We should gracefully handle this situation rather than LBUG. Test-Parameters: trivial HPE-bug-id: LUS-11344 Signed-off-by: Chris Horn Change-Id: Ib6fc5ed41f12762843fe9f638ffd523699936556 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51781 Tested-by: Oleg Drokin Reviewed-by: Oleg Drokin --- diff --git a/lnet/klnds/kfilnd/kfilnd.h b/lnet/klnds/kfilnd/kfilnd.h index 9165081..3e26405 100644 --- a/lnet/klnds/kfilnd/kfilnd.h +++ b/lnet/klnds/kfilnd/kfilnd.h @@ -101,6 +101,8 @@ #define CFS_KFI_FAIL_RECV 0xF112 #define CFS_KFI_FAIL_MSG_UNPACK 0xF113 #define CFS_KFI_FAIL_MSG_TYPE 0xF114 +#define CFS_KFI_FAIL_WAIT_SEND_COMP1 0xF115 +#define CFS_KFI_FAIL_WAIT_SEND_COMP2 0xF116 /* Maximum number of transaction keys supported. 
*/ #define KFILND_EP_KEY_BITS 16U diff --git a/lnet/klnds/kfilnd/kfilnd_ep.c b/lnet/klnds/kfilnd/kfilnd_ep.c index ff743ee..d2c677c 100644 --- a/lnet/klnds/kfilnd/kfilnd_ep.c +++ b/lnet/klnds/kfilnd/kfilnd_ep.c @@ -180,8 +180,8 @@ static void kfilnd_ep_err_fail_loc_work(struct work_struct *work) kfree(err); } -static int kfilnd_ep_gen_fake_err(struct kfilnd_ep *ep, - const struct kfi_cq_err_entry *err) +int kfilnd_ep_gen_fake_err(struct kfilnd_ep *ep, + const struct kfi_cq_err_entry *err) { struct kfilnd_ep_err_fail_loc_work *fake_err; diff --git a/lnet/klnds/kfilnd/kfilnd_ep.h b/lnet/klnds/kfilnd/kfilnd_ep.h index 6b83fdf..c0f0e9c 100644 --- a/lnet/klnds/kfilnd/kfilnd_ep.h +++ b/lnet/klnds/kfilnd/kfilnd_ep.h @@ -65,6 +65,8 @@ void kfilnd_ep_queue_tn_replay(struct kfilnd_ep *ep, int kfilnd_ep_get_key(struct kfilnd_ep *ep); void kfilnd_ep_put_key(struct kfilnd_ep *ep, unsigned int key); +int kfilnd_ep_gen_fake_err(struct kfilnd_ep *ep, + const struct kfi_cq_err_entry *err); #endif /* _KFILND_EP_ */ diff --git a/lnet/klnds/kfilnd/kfilnd_tn.c b/lnet/klnds/kfilnd/kfilnd_tn.c index 04b457f..d5b0672 100644 --- a/lnet/klnds/kfilnd/kfilnd_tn.c +++ b/lnet/klnds/kfilnd/kfilnd_tn.c @@ -1057,6 +1057,11 @@ static int kfilnd_tn_state_wait_comp(struct kfilnd_transaction *tn, switch (event) { case TN_EVENT_TX_OK: + if (unlikely(tn->msg_type == KFILND_MSG_BULK_PUT_REQ) && + CFS_FAIL_CHECK_RESET(CFS_KFI_FAIL_WAIT_SEND_COMP1, + CFS_KFI_FAIL_WAIT_SEND_COMP2 | + CFS_FAIL_ONCE)) + break; kfilnd_peer_alive(tn->tn_kp); kfilnd_tn_timeout_enable(tn); kfilnd_tn_state_change(tn, TN_STATE_WAIT_TAG_COMP); @@ -1064,6 +1069,16 @@ static int kfilnd_tn_state_wait_comp(struct kfilnd_transaction *tn, case TN_EVENT_TAG_RX_OK: kfilnd_tn_state_change(tn, TN_STATE_WAIT_SEND_COMP); + if (unlikely(tn->msg_type == KFILND_MSG_BULK_PUT_REQ) && + CFS_FAIL_CHECK(CFS_KFI_FAIL_WAIT_SEND_COMP2)) { + struct kfi_cq_err_entry fake_error = { + .op_context = tn, + .flags = KFI_MSG | KFI_SEND, + .err = EIO, + }; + + 
kfilnd_ep_gen_fake_err(tn->tn_ep, &fake_error); + } break; case TN_EVENT_TX_FAIL: @@ -1125,14 +1140,22 @@ static int kfilnd_tn_state_wait_send_comp(struct kfilnd_transaction *tn, KFILND_TN_DEBUG(tn, "%s event status %d", tn_event_to_str(event), status); - if (event == TN_EVENT_TX_OK) { + switch (event) { + case TN_EVENT_TX_OK: kfilnd_peer_alive(tn->tn_kp); - kfilnd_tn_finalize(tn, tn_released); - } else { + break; + case TN_EVENT_TX_FAIL: + kfilnd_tn_status_update(tn, status, + LNET_MSG_STATUS_NETWORK_TIMEOUT); + kfilnd_peer_tn_failed(tn->tn_kp, status); + break; + default: KFILND_TN_ERROR(tn, "Invalid %s event", tn_event_to_str(event)); LBUG(); } + kfilnd_tn_finalize(tn, tn_released); + return 0; } diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index 37db599..b634154 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -3623,6 +3623,26 @@ test_310() { } run_test 310 "Set timeout and verify" +test_311() { + [[ $NETTYPE == kfi* ]] || + skip "Need kfi network type" + + setupall || error "setupall failed" + + mkdir -p $DIR/$tdir || error "mkdir failed" + dd if=/dev/zero of=$DIR/$tdir/$tfile bs=1M count=1 oflag=direct || + error "dd write failed" + + local list=$(comma_list $(osts_nodes)) + +#define CFS_KFI_FAIL_WAIT_SEND_COMP1 0xF115 + do_nodes $list $LCTL set_param fail_loc=0x8000F115 + dd if=$DIR/$tdir/$tfile of=/dev/null bs=1M count=1 || + error "dd read failed" + + cleanupall || error "Failed cleanup" +} +run_test 311 "Fail bulk put in send wait completion" check_udsp_prio() { local target_net="${1}"