From e6225c07ce4c0037a127a41b2bc539364dfd1f4d Mon Sep 17 00:00:00 2001 From: Ann Koehler Date: Mon, 14 Oct 2019 11:30:56 -0500 Subject: [PATCH] LU-12816 ptlrpc: ptlrpc_register_bulk LBUG on ENOMEM Another path through ptl_send_rpc() can trigger the assertion reported in LU-10643. The assertion in ptlrpc_register_bulk() on !desc->bd_registered fails when an RPC is resent after the first send attempt failed to attach the reply buffer. The bulk error cleanup in ptl_send_rpc() does not reset the bd_registered flag, so the flag is still set when the RPC is resent. Cray-bug-id: LUS-7946 Signed-off-by: Ann Koehler Change-Id: I474211f196ea9bd83a036747e25c91c37c85ffbb Reviewed-on: https://review.whamcloud.com/36309 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: Shaun Tancheff Reviewed-by: Chris Horn Reviewed-by: Oleg Drokin --- lustre/ptlrpc/niobuf.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index eaf6bd0..4720592 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -908,18 +908,20 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) GOTO(out, rc); cleanup_me: - /* MEUnlink is safe; the PUT didn't even get off the ground, and - * nobody apart from the PUT's target has the right nid+XID to - * access the reply buffer. */ - rc2 = LNetMEUnlink(reply_me_h); - LASSERT (rc2 == 0); - /* UNLINKED callback called synchronously */ - LASSERT(!request->rq_receiving_reply); + /* MEUnlink is safe; the PUT didn't even get off the ground, and + * nobody apart from the PUT's target has the right nid+XID to + * access the reply buffer. */ + rc2 = LNetMEUnlink(reply_me_h); + LASSERT (rc2 == 0); + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); cleanup_bulk: - /* We do sync unlink here as there was no real transfer here so - * the chance to have long unlink to sluggish net is smaller here. 
*/ + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. */ ptlrpc_unregister_bulk(request, 0); + if (request->rq_bulk != NULL) + request->rq_bulk->bd_registered = 0; out: if (rc == -ENOMEM) { /* set rq_sent so that this request is treated -- 1.8.3.1