From 0661102dea52e7f21636b0ed808ebf037b946954 Mon Sep 17 00:00:00 2001
From: Patrick Farrell
Date: Wed, 13 Mar 2024 10:46:12 -0400
Subject: [PATCH] LU-13802 ptlrpc: correctly remove inflight request

When removing a request from the active set on error, we
must also remove it from "inflight" or we will not reduce
inflight as needed and hang on cleanup.

This bug has been latent for some time, but running sanity
414 with hybrid IO tends to trigger it.

Signed-off-by: Patrick Farrell
Change-Id: Ib73980724f6e2f5a74400a39840df2e8835a6e23
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54099
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: Shaun Tancheff
Reviewed-by: Oleg Drokin
Reviewed-by: Alex Zhuravlev
---
 lustre/ptlrpc/client.c | 5 ++++-
 lustre/tests/sanity.sh | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index fcfd417..a3c593e 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -2127,8 +2127,11 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
 				rc = ptl_send_rpc(req, 0);
 				if (rc == -ENOMEM) {
 					spin_lock(&imp->imp_lock);
-					if (!list_empty(&req->rq_list))
+					if (!list_empty(&req->rq_list)) {
 						list_del_init(&req->rq_list);
+						if (atomic_dec_and_test(&imp->imp_inflight))
+							wake_up(&imp->imp_recovery_waitq);
+					}
 					spin_unlock(&imp->imp_lock);
 					ptlrpc_rqphase_move(req, RQ_PHASE_NEW);
 					continue;
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index ae3b3c4..e5a43f5 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -29609,6 +29609,11 @@ test_414() {
 	$LCTL set_param fail_loc=0x80000521
 	dd if=/dev/zero of=$DIR/$tfile bs=2M count=1 oflag=sync
 	rm -f $DIR/$tfile
+	# This error path has sometimes left inflight requests dangling, so
+	# test for this by remounting the client (umount will hang if there's
+	# a dangling request)
+	umount_client $MOUNT
+	mount_client $MOUNT
 }
 run_test 414 "simulate ENOMEM in ptlrpc_register_bulk()"
 
-- 
1.8.3.1