Whamcloud - gitweb
LU-14876 out: don't connect to busy MDS-MDS export 62/44362/6
authorMikhail Pershin <mpershin@whamcloud.com>
Wed, 21 Jul 2021 15:14:01 +0000 (18:14 +0300)
committerOleg Drokin <green@whamcloud.com>
Mon, 13 Sep 2021 20:09:11 +0000 (20:09 +0000)
MDS-MDS connection is missing check for busy requests upon
reconnect, so resent can be executed concurrently with
original request.

- in ptlrpc_server_check_resend_in_progress() remove exception
  for bulk requests, they can be compared by XID nowadays.
  This prevents OUT requests vs resent execution as well.
- fix messages in target_handle_connect() to report correct
  information about connection details
- in out_handle() check for last_xid only once per OUT_UPDATE
- test 110m is added to recovery-small to reproduce the issue

Lustre-change: https://review.whamcloud.com/44390
Lustre-commit: 301d76a71176c186129231ddd1323bae21100165

Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: I2ad183674d59a2cdeab0037bd8551c607b10ffeb
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/44362
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/include/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/ptlrpc/service.c
lustre/target/out_handler.c
lustre/tests/recovery-small.sh

index b5aed43..d6d2f48 100644 (file)
@@ -450,6 +450,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c
 #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB3  0x520
 #define OBD_FAIL_PTLRPC_BULK_ATTACH      0x521
+#define OBD_FAIL_PTLRPC_RESEND_RACE     0x525
 #define OBD_FAIL_PTLRPC_CONNECT_RACE    0x531
 
 #define OBD_FAIL_OBD_PING_NET            0x600
index 222457c..d9d7335 100644 (file)
@@ -1097,14 +1097,14 @@ int target_handle_connect(struct ptlrpc_request *req)
 
        /* Note: lw_client is needed in MDS-MDS failover during update log
         * processing, so we needs to allow lw_client to be connected at
-        * anytime, instead of only the initial connection */
-       lw_client = (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0;
+        * anytime, instead of only the initial connection
+        */
+       lw_client = OCD_HAS_FLAG(data, LIGHTWEIGHT);
 
        if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) {
                initial_conn = true;
-               mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS) != 0;
-               mds_mds_conn = (data->ocd_connect_flags &
-                               OBD_CONNECT_MDS_MDS) != 0;
+               mds_conn = OCD_HAS_FLAG(data, MDS);
+               mds_mds_conn = OCD_HAS_FLAG(data, MDS_MDS);
 
                /* OBD_CONNECT_MNE_SWAB is defined as OBD_CONNECT_MDS_MDS
                 * for Imperative Recovery connection from MGC to MGS.
@@ -1161,27 +1161,29 @@ int target_handle_connect(struct ptlrpc_request *req)
                export = NULL;
                rc = -EALREADY;
        } else if ((mds_conn || (lw_client && initial_conn) ||
-                  data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) &&
-                  export->exp_connection != NULL) {
+                  OCD_HAS_FLAG(data, MDS_MDS)) && export->exp_connection) {
                spin_unlock(&export->exp_lock);
                if (req->rq_peer.nid != export->exp_connection->c_peer.nid) {
                        /* MDS or LWP reconnected after failover. */
-                       LCONSOLE_WARN("%s: Received %s connection from "
-                           "%s, removing former export from %s\n",
-                           target->obd_name, mds_conn ? "MDS" : "LWP",
-                           libcfs_nid2str(req->rq_peer.nid),
-                           libcfs_nid2str(export->exp_connection->c_peer.nid));
+                       LCONSOLE_WARN("%s: Received %s connection from %s, removing former export from %s\n",
+                                     target->obd_name,
+                                     lw_client ? "LWP" : "MDS",
+                                     libcfs_nid2str(req->rq_peer.nid),
+                                     libcfs_nid2str(export->exp_connection->c_peer.nid));
                } else {
-                       /* New MDS connection from the same NID. */
-                       LCONSOLE_WARN("%s: Received new %s connection from "
-                               "%s, removing former export from same NID\n",
-                               target->obd_name, mds_conn ? "MDS" : "LWP",
-                               libcfs_nid2str(req->rq_peer.nid));
+                       /* New connection from the same NID. */
+                       LCONSOLE_WARN("%s: Received new %s connection from %s, %s former export from same NID\n",
+                                     target->obd_name,
+                                     lw_client ? "LWP" : "MDS",
+                                     libcfs_nid2str(req->rq_peer.nid),
+                                     OCD_HAS_FLAG(data, MDS_MDS) ?
+                                     "keep" : "remove");
                }
 
                if (req->rq_peer.nid == export->exp_connection->c_peer.nid &&
-                   data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) {
-                       /* Because exports between MDTs will always be
+                   OCD_HAS_FLAG(data, MDS_MDS)) {
+                       /*
+                        * Because exports between MDTs will always be
                         * kept, let's do not fail such export if they
                         * come from the same NID, otherwise it might
                         * cause eviction between MDTs, which might
index eb4c63d..6373c36 100644 (file)
@@ -39,6 +39,7 @@
 #include <lu_object.h>
 #include <uapi/linux/lnet/lnet-types.h>
 #include "ptlrpc_internal.h"
+#include <linux/delay.h>
 
 /* The following are visible and mutable through /sys/module/ptlrpc */
 int test_req_buffer_pressure = 0;
@@ -1568,11 +1569,6 @@ ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req)
            (atomic_read(&req->rq_export->exp_rpc_count) == 0))
                return NULL;
 
-       /* bulk request are aborted upon reconnect, don't try to
-        * find a match */
-       if (req->rq_bulk_write || req->rq_bulk_read)
-               return NULL;
-
        /* This list should not be longer than max_requests in
         * flights on the client, so it is not all that long.
         * Also we only hit this codepath in case of a resent
@@ -1694,13 +1690,20 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
        hp = rc > 0;
        ptlrpc_nrs_req_initialize(svcpt, req, hp);
 
-       if (req->rq_export != NULL) {
+       while (req->rq_export != NULL) {
                struct obd_export *exp = req->rq_export;
 
                /* do search for duplicated xid and the adding to the list
                 * atomically */
                spin_lock_bh(&exp->exp_rpc_lock);
                orig = ptlrpc_server_check_resend_in_progress(req);
+               if (orig && OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_RESEND_RACE)) {
+                       spin_unlock_bh(&exp->exp_rpc_lock);
+
+                       OBD_RACE(OBD_FAIL_PTLRPC_RESEND_RACE);
+                       msleep(4 * MSEC_PER_SEC);
+                       continue;
+               }
                if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) {
                        bool linked;
 
@@ -1731,6 +1734,7 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
                else
                        list_add(&req->rq_exp_list, &exp->exp_reg_rpcs);
                spin_unlock_bh(&exp->exp_rpc_lock);
+               break;
        }
 
        /* the current thread is not the processing thread for this request
index 7dc9d42..a238f58 100644 (file)
@@ -52,7 +52,7 @@ static void out_reconstruct(const struct lu_env *env, struct dt_device *dt,
                            struct object_update_reply *reply,
                            int index)
 {
-       CDEBUG(D_INFO, "%s: fork reply reply %p index %d: rc = %d\n",
+       CDEBUG(D_HA, "%s: fork reply reply %p index %d: rc = %d\n",
               dt_obd_name(dt), reply, index, 0);
 
        object_update_result_insert(reply, NULL, 0, index, 0);
@@ -65,16 +65,10 @@ typedef void (*out_reconstruct_t)(const struct lu_env *env,
                                  struct object_update_reply *reply,
                                  int index);
 
-static inline int out_check_resent(const struct lu_env *env,
-                                  struct dt_device *dt,
-                                  struct dt_object *obj,
-                                  struct ptlrpc_request *req,
-                                  out_reconstruct_t reconstruct,
-                                  struct object_update_reply *reply,
-                                  int index)
+static inline bool out_check_resent(struct ptlrpc_request *req)
 {
        if (likely(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)))
-               return 0;
+               return false;
 
        if (req_xid_is_last(req)) {
                struct lsd_client_data *lcd;
@@ -90,14 +84,12 @@ static inline int out_check_resent(const struct lu_env *env,
                lustre_msg_set_transno(req->rq_repmsg, req->rq_transno);
                lustre_msg_set_status(req->rq_repmsg, req->rq_status);
 
-               DEBUG_REQ(D_RPCTRACE, req, "restoring resent RPC");
-
-               reconstruct(env, dt, obj, reply, index);
-               return 1;
+               DEBUG_REQ(D_HA, req, "reconstruct resent RPC");
+               return true;
        }
-       DEBUG_REQ(D_HA, req, "no reply for RESENT req (have %lld)",
-                req->rq_export->exp_target_data.ted_lcd->lcd_last_xid);
-       return 0;
+       DEBUG_REQ(D_HA, req, "reprocess RESENT req, last_xid is %lld",
+                 req->rq_export->exp_target_data.ted_lcd->lcd_last_xid);
+       return false;
 }
 
 static int out_create(struct tgt_session_info *tsi)
@@ -971,6 +963,8 @@ int out_handle(struct tgt_session_info *tsi)
        int                             rc1 = 0;
        int                             ouh_size, reply_size;
        int                             updates;
+       bool need_reconstruct;
+
        ENTRY;
 
        req_capsule_set(pill, &RQF_OUT_UPDATE);
@@ -1108,6 +1102,8 @@ int out_handle(struct tgt_session_info *tsi)
        tti->tti_u.update.tti_update_reply = reply;
        tti->tti_mult_trans = !req_is_replay(tgt_ses_req(tsi));
 
+       need_reconstruct = out_check_resent(pill->rc_req);
+
        /* Walk through updates in the request to execute them */
        for (i = 0; i < update_buf_count; i++) {
                struct tgt_handler      *h;
@@ -1155,12 +1151,19 @@ int out_handle(struct tgt_session_info *tsi)
 
                        /* Check resend case only for modifying RPC */
                        if (h->th_flags & MUTABOR) {
-                               struct ptlrpc_request *req = tgt_ses_req(tsi);
+                               /* sanity check for last XID changing */
+                               if (unlikely(!need_reconstruct &&
+                                            req_xid_is_last(pill->rc_req))) {
+                                       DEBUG_REQ(D_ERROR, pill->rc_req,
+                                                 "unexpected last XID change");
+                                       GOTO(next, rc = -EINVAL);
+                               }
 
-                               if (out_check_resent(env, dt, dt_obj, req,
-                                                    out_reconstruct, reply,
-                                                    reply_index))
+                               if (need_reconstruct) {
+                                       out_reconstruct(env, dt, dt_obj, reply,
+                                                       reply_index);
                                        GOTO(next, rc = 0);
+                               }
 
                                if (dt->dd_rdonly)
                                        GOTO(next, rc = -EROFS);
@@ -1169,6 +1172,10 @@ int out_handle(struct tgt_session_info *tsi)
                        /* start transaction for modification RPC only */
                        if (h->th_flags & MUTABOR && current_batchid == -1) {
                                current_batchid = update->ou_batchid;
+
+                               if (reply_index == 0)
+                                       CFS_RACE(OBD_FAIL_PTLRPC_RESEND_RACE);
+
                                rc = out_tx_start(env, dt, ta, tsi->tsi_exp);
                                if (rc != 0)
                                        GOTO(next, rc);
index d9f60e9..d7e8d0c 100755 (executable)
@@ -2260,6 +2260,38 @@ test_110k() {
 }
 run_test 110k "FID_QUERY failed during recovery"
 
+test_110m () {
+       (( $(lustre_version_code $SINGLEMDS) >= $(version_code 2.12.7) )) ||
+               skip "Need MDS version at least 2.12.7"
+       (( $MDSCOUNT >= 2 )) || skip "needs at least 2 MDTs"
+       local remote_dir=$DIR/$tdir/remote_dir
+       local mdccli
+       local uuid
+       local diridx
+
+       mkdir -p $DIR/$tdir
+
+#define OBD_FAIL_PTLRPC_RESEND_RACE 0x0525
+       do_facet mds1 $LCTL set_param fail_loc=0x80000525
+       $LFS mkdir -i 1 -c2 $remote_dir &
+       mkdir_pid=$!
+       sleep 3
+       # initiate the re-connect & re-send
+       mdccli=$(do_facet mds2 $LCTL dl |
+               awk '/MDT0000-osp-MDT0001/ {print $4;}')
+       uuid=$(do_facet mds2 $LCTL get_param -n osp.$mdccli.mdt_conn_uuid)
+       echo "conn_uuid=$uuid"
+       do_facet mds2 $LCTL set_param "osp.$mdccli.import=connection=$uuid"
+
+       wait $mkdir_pid
+       (( $? == 0 )) || error "mkdir failed"
+       lctl dk > ../../lustre.log
+       diridx=$($LFS getstripe -m $remote_dir)
+       (( $diridx == 1 )) || error "$diridx != 1"
+       rm -rf $DIR/$tdir || error "rmdir failed"
+}
+run_test 110m "update resent vs original RPC race"
+
 # LU-2844 mdt prepare fail should not cause umount oops
 test_111 ()
 {