Whamcloud - gitweb
LU-16451 kfilnd: Improve CQ error logging 89/49589/2
author: Chris Horn <chris.horn@hpe.com>
Tue, 1 Nov 2022 19:39:39 +0000 (13:39 -0600)
committer: Oleg Drokin <green@whamcloud.com>
Fri, 27 Jan 2023 00:35:45 +0000 (00:35 +0000)
Improve CQ error logging for send events by printing the errno from
the CQ event, the provider error, and the event flags. This should
allow us to better root-cause TN failures.

Also remove an extra newline character from the receive error message.

HPE-bug-id: LUS-11314
Test-Parameters: trivial
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I79bbe0312a9124dd34285d43b6e83f9d897923c1
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49589
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Ron Gredvig <ron.gredvig@hpe.com>
Reviewed-by: Ian Ziemba <ian.ziemba@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/kfilnd/kfilnd_cq.c

index d070afe..5b5678e 100644 (file)
@@ -48,7 +48,7 @@ void kfilnd_cq_process_error(struct kfilnd_ep *ep,
        switch (error->flags) {
        case KFI_MSG | KFI_RECV:
                if (error->err != ECANCELED) {
-                       KFILND_EP_ERROR(ep, "Dropping error receive event %d\n",
+                       KFILND_EP_ERROR(ep, "Dropping error receive event %d",
                                        -error->err);
                        return;
                }
@@ -76,6 +76,10 @@ void kfilnd_cq_process_error(struct kfilnd_ep *ep,
                tn = error->op_context;
                tn_event = TN_EVENT_TX_FAIL;
                status = -error->err;
+               KFILND_EP_ERROR(ep,
+                               "msg send error %d prov error %d flags %llx",
+                               status, -error->prov_errno, error->flags);
+
                break;
 
        case KFI_TAGGED | KFI_SEND:
@@ -84,6 +88,9 @@ void kfilnd_cq_process_error(struct kfilnd_ep *ep,
                tn = error->op_context;
                tn_event = TN_EVENT_TAG_TX_FAIL;
                status = -error->err;
+               KFILND_EP_ERROR(ep,
+                               "tagged error %d prov error %d flags %llx",
+                               status, -error->prov_errno, error->flags);
                break;
 
        default: