Whamcloud - gitweb
LU-19076 ptlrpc: resend can hit original req 97/59497/12
authorAlex Zhuravlev <bzzz@whamcloud.com>
Fri, 30 May 2025 15:45:41 +0000 (18:45 +0300)
committerOleg Drokin <green@whamcloud.com>
Thu, 12 Jun 2025 06:37:12 +0000 (06:37 +0000)
the client may need to resend a request if the reply buffer can
not fit the reply (LOVEA has just changed, for example).
in some environment (e.g. server and client share same node),
a resend RPC can find the original RPC on export's list and the
server just drops the resend RPC thinking it's a duplicate.
this way the client gets no reply for the resend RPC and times
out.

if this problem happens during layout refresh where the client
holds layout lock requesting LOVEA with MDS_GETXATTR, then
the server can evict the client.

the patch removes RPC from export's list just before sending a
reply as RPC has been already processed and for non-idempotent
request reconstruction should take place.

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I48437ad018b9b43b9fff4157203906fd84b6cfd3
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59497
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Patrick Farrell <pfarrell@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_net.h
lustre/include/obd_support.h
lustre/mdt/mdt_handler.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/service.c
lustre/tests/replay-single.sh

index d4fd6c8..f871031 100644 (file)
@@ -1037,7 +1037,8 @@ struct ptlrpc_request {
                /* bulk request, sent to server, but uncommitted */
                rq_unstable:1,
                rq_early_free_repbuf:1, /* free reply buffer in advance */
-               rq_allow_intr:1;
+               rq_allow_intr:1,
+               rq_pause_after_reply:1;
        /** @} */
 
        /** server-side flags are serialized by rq_lock @{ */
@@ -2259,6 +2260,7 @@ struct ptlrpc_service *ptlrpc_register_service(
 int ptlrpc_unregister_service(struct ptlrpc_service *service);
 int ptlrpc_service_health_check(struct ptlrpc_service *service);
 void ptlrpc_server_drop_request(struct ptlrpc_request *req);
+void ptlrpc_del_exp_list(struct ptlrpc_request *req);
 void ptlrpc_request_change_export(struct ptlrpc_request *req,
                                  struct obd_export *export);
 void ptlrpc_update_export_timer(struct ptlrpc_request *req);
index 863fe99..aad5eea 100644 (file)
@@ -774,6 +774,7 @@ extern bool obd_enable_fname_encoding;
 /* continuation of MDS related constants */
 #define OBD_FAIL_MDS_PAUSE_CREATE_AFTER_LOOKUP 0x2401
 #define OBD_FAIL_MDS_CONNECT_ACCESS            0x2402
+#define OBD_FAIL_MDS_PAUSE_GETATTR             0x2403
 
 /* PLEASE, KEEP NUMBERS UP TO 0x3000 RESERVED FOR OBD_FAIL_MDS_* */
 
index 3bcb56b..67fa666 100644 (file)
@@ -2108,6 +2108,9 @@ static int mdt_getattr_name_lock(struct mdt_thread_info *info,
        if (info->mti_mdt->mdt_enable_dir_auto_split)
                ma_need |= MA_DIRENT_CNT;
 
+       if (CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_PAUSE_GETATTR, cfs_fail_val))
+               req->rq_pause_after_reply = 1;
+
        if (info->mti_cross_ref) {
                /* Only getattr on the child. Parent is on another node. */
                mdt_set_disposition(info, ldlm_rep,
index 375d44c..a321cce 100644 (file)
@@ -735,6 +735,12 @@ int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
        if (unlikely(rc))
                goto out;
 
+       /*
+        * remove from the export list so quick
+        * resend won't find the original one.
+        */
+       ptlrpc_del_exp_list(req);
+
        req->rq_sent = ktime_get_real_seconds();
 
        rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
index fca54af..d2dae1b 100644 (file)
@@ -1023,9 +1023,14 @@ static void ptlrpc_add_exp_list_nolock(struct ptlrpc_request *req,
                set_bit(tag - 1, export->exp_used_slots);
 }
 
-static void ptlrpc_del_exp_list(struct ptlrpc_request *req)
+void ptlrpc_del_exp_list(struct ptlrpc_request *req)
 {
-       __u16 tag = lustre_msg_get_tag(req->rq_reqmsg);
+       __u16 tag = 0;
+
+       if (unlikely(!req->rq_export))
+               return;
+       if (likely(req->rq_reqmsg))
+               tag = lustre_msg_get_tag(req->rq_reqmsg);
 
        spin_lock(&req->rq_export->exp_rpc_lock);
        list_del_init(&req->rq_exp_list);
@@ -1623,6 +1628,7 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
                        lustre_msg_get_handle(reqcopy->rq_reqmsg));
        if (reqcopy->rq_export == NULL)
                GOTO(out, rc = -ENODEV);
+       INIT_LIST_HEAD(&reqcopy->rq_exp_list);
 
        /* RPC ref */
        class_export_rpc_inc(reqcopy->rq_export);
@@ -2554,6 +2560,11 @@ put_conn:
                          request->rq_early_count,
                          div_u64(arrived_usecs, USEC_PER_SEC));
        }
+       if (unlikely(request->rq_pause_after_reply)) {
+               DEBUG_REQ(D_WARNING, request, "pause req after reply");
+               schedule_timeout_uninterruptible(cfs_time_seconds(3));
+               DEBUG_REQ(D_WARNING, request, "continue");
+       }
 
        ptlrpc_server_finish_active_request(svcpt, request);
 
index c79fd22..4d92a82 100755 (executable)
@@ -5382,6 +5382,31 @@ test_202() {
 }
 run_test 202 "pfl replay should recovery layout generation"
 
+test_203() {
+       mount_client $MOUNT2
+       stack_trap "umount_client $MOUNT2"
+
+       local start=$SECONDS
+#define OBD_FAIL_MDS_PAUSE_GETATTR             0x2403
+       do_facet mds1 "$LCTL set_param fail_loc=0x80002403 fail_val=2"
+       echo "STAT"
+       stat $MOUNT/$tfile &
+       local PID=$!
+
+       sleep 0.5
+
+       echo "SETSTRIPE"
+       $LFS setstripe -E 1MB -c -1 -E 2MB -c -1 -E 3MB -c -1 -E 4MB -c -1 \
+               -E 5MB -c -1 -E 6MB -c -1 -E 7MB -c -1 -E 8MB -c -1 \
+               -E 9MB -c -1 -E 10MB -c -1 -E 11MB -c -1 -E eof -c -1 \
+               $MOUNT2/$tfile
+
+       wait $PID
+       do_facet mds1 "$LCTL set_param fail_loc=0 fail_val=0"
+       (( SECONDS-start < TIMEOUT/2 )) ||
+               error "took too long: $((SECONDS-start)) >= $((TIMEOUT/2))"
+}
+run_test 203 "resend can hit original request"
 
 complete_test $SECONDS
 check_and_cleanup_lustre