Whamcloud - gitweb
LU-14397 ptlrpc: idle import vs lock enqueue race 03/41403/6
author Alexander Boyko <c17825@cray.com>
Wed, 3 Feb 2021 11:04:52 +0000 (06:04 -0500)
committer Oleg Drokin <green@whamcloud.com>
Wed, 28 Apr 2021 02:11:32 +0000 (02:11 +0000)
There is a window between the ptlrpc_check_import_is_idle() check
and setting LUSTRE_IMP_CONNECTING in which a lock enqueue can race
with the idle disconnect. The lock gets granted on the OST and is
returned to the client, but the server's lock is then destroyed on
OST_DISCONNECT.

Perform the import counters check together with setting
LUSTRE_IMP_CONNECTING, under imp_lock.
A regression test, test_812c, was added to sanity.sh.

HPE-bug-id: LUS-8705
Signed-off-by: Andriy Skulysh <c17819@cray.com>
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I85da18b29ca58f811ecde8ce72ba24373388947e
Reviewed-on: https://review.whamcloud.com/41403
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Andriy Skulysh <askulysh@gmail.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/osc/osc_lock.c
lustre/ptlrpc/import.c
lustre/ptlrpc/pinger.c
lustre/tests/sanity.sh

index 48cb678..2b3902a 100644 (file)
@@ -455,6 +455,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_PTLRPC_ROUND_XID       0x530
 #define OBD_FAIL_PTLRPC_CONNECT_RACE    0x531
 #define OBD_FAIL_NET_ERROR_RPC          0x532
+#define OBD_FAIL_PTLRPC_IDLE_RACE       0x533
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 /*     OBD_FAIL_OBD_LOG_CANCEL_NET      0x601 obsolete since 1.5 */
index 6485eea..3d0cf0f 100644 (file)
@@ -1043,6 +1043,11 @@ enqueue_base:
                if (osc_lock_is_lockless(oscl)) {
                        oio->oi_lockless = 1;
                } else if (!async) {
+                       if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_IDLE_RACE)) {
+                               OBD_RACE(OBD_FAIL_PTLRPC_IDLE_RACE);
+                               set_current_state(TASK_UNINTERRUPTIBLE);
+                               schedule_timeout(cfs_time_seconds(1) / 2);
+                       }
                        LASSERT(oscl->ols_state == OLS_GRANTED);
                        LASSERT(oscl->ols_hold);
                        LASSERT(oscl->ols_dlmlock != NULL);
index d2e45fa..969199b 100644 (file)
@@ -1695,7 +1695,6 @@ static struct ptlrpc_request *ptlrpc_disconnect_prep_req(struct obd_import *imp)
        req->rq_timeout = min_t(timeout_t, req->rq_timeout,
                                INITIAL_CONNECT_TIMEOUT);
 
-       import_set_state(imp, LUSTRE_IMP_CONNECTING);
        req->rq_send_state =  LUSTRE_IMP_CONNECTING;
        ptlrpc_request_set_replen(req);
 
@@ -1744,14 +1743,18 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
                        rc = -EINTR;
        }
 
+       req = ptlrpc_disconnect_prep_req(imp);
+       if (IS_ERR(req))
+               GOTO(set_state, rc = PTR_ERR(req));
+
        spin_lock(&imp->imp_lock);
-       if (imp->imp_state != LUSTRE_IMP_FULL)
+       if (imp->imp_state != LUSTRE_IMP_FULL) {
+               ptlrpc_req_finished_with_imp_lock(req);
                GOTO(out, rc);
+       }
+       import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING);
        spin_unlock(&imp->imp_lock);
 
-       req = ptlrpc_disconnect_prep_req(imp);
-       if (IS_ERR(req))
-               GOTO(set_state, rc = PTR_ERR(req));
        rc = ptlrpc_queue_wait(req);
        ptlrpc_req_finished(req);
 
@@ -1832,6 +1835,21 @@ static int ptlrpc_disconnect_idle_interpret(const struct lu_env *env,
        return 0;
 }
 
+static bool ptlrpc_can_idle(struct obd_import *imp)
+{
+       struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
+
+       /* one request for disconnect rpc */
+       if (atomic_read(&imp->imp_reqs) > 1)
+               return false;
+
+       /* any lock increases ns_bref being a resource holder */
+       if (ns && atomic_read(&ns->ns_bref) > 0)
+               return false;
+
+       return true;
+}
+
 int ptlrpc_disconnect_and_idle_import(struct obd_import *imp)
 {
        struct ptlrpc_request *req;
@@ -1843,30 +1861,38 @@ int ptlrpc_disconnect_and_idle_import(struct obd_import *imp)
        if (ptlrpc_import_in_recovery(imp))
                RETURN(0);
 
+       req = ptlrpc_disconnect_prep_req(imp);
+       if (IS_ERR(req))
+               RETURN(PTR_ERR(req));
+
+       req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret;
+
+       if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_IDLE_RACE)) {
+               __u32 idx;
+
+               server_name2index(imp->imp_obd->obd_name, &idx, NULL);
+               if (idx == 0)
+                       OBD_RACE(OBD_FAIL_PTLRPC_IDLE_RACE);
+       }
+
        spin_lock(&imp->imp_lock);
-       if (imp->imp_state != LUSTRE_IMP_FULL) {
+       if (imp->imp_state != LUSTRE_IMP_FULL || !ptlrpc_can_idle(imp)) {
+               ptlrpc_req_finished_with_imp_lock(req);
                spin_unlock(&imp->imp_lock);
                RETURN(0);
        }
+       import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING);
+       /* don't make noise at reconnection */
+       imp->imp_was_idle = 1;
        spin_unlock(&imp->imp_lock);
 
-       req = ptlrpc_disconnect_prep_req(imp);
-       if (IS_ERR(req))
-               RETURN(PTR_ERR(req));
-
        CDEBUG_LIMIT(imp->imp_idle_debug, "%s: disconnect after %llus idle\n",
                     imp->imp_obd->obd_name,
                     ktime_get_real_seconds() - imp->imp_last_reply_time);
 
-       /* don't make noise at reconnection */
-       spin_lock(&imp->imp_lock);
-       imp->imp_was_idle = 1;
-       spin_unlock(&imp->imp_lock);
-
-       req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret;
        ptlrpcd_add_req(req);
 
-       RETURN(0);
+       RETURN(1);
 }
 EXPORT_SYMBOL(ptlrpc_disconnect_and_idle_import);
 
index 3b1edc4..9bcfebd 100644 (file)
@@ -134,8 +134,9 @@ static int ptlrpc_ping(struct obd_import *imp)
 
        ENTRY;
 
-       if (ptlrpc_check_import_is_idle(imp))
-               RETURN(ptlrpc_disconnect_and_idle_import(imp));
+       if (ptlrpc_check_import_is_idle(imp) &&
+           ptlrpc_disconnect_and_idle_import(imp) == 1)
+                       RETURN(0);
 
        req = ptlrpc_prep_ping(imp);
        if (!req) {
index c4e305d..36c19bd 100755 (executable)
@@ -25601,6 +25601,28 @@ test_812b() { # LU-12378
 }
 run_test 812b "do not drop no resend request for idle connect"
 
+test_812c() {
+       local old
+
+       old=$($LCTL get_param -n osc.*.idle_timeout | head -n 1)
+
+       $LFS setstripe -c 1 -o 0 $DIR/$tfile
+       $LFS getstripe $DIR/$tfile
+       $LCTL set_param osc.*.idle_timeout=10
+       stack_trap "$LCTL set_param osc.*.idle_timeout=$old" EXIT
+       # ensure ost1 is connected
+       stat $DIR/$tfile >/dev/null || error "can't stat"
+       wait_osc_import_state client ost1 FULL
+       # no locks, no reqs to let the connection idle
+       cancel_lru_locks osc
+
+#define OBD_FAIL_PTLRPC_IDLE_RACE       0x533
+       $LCTL set_param fail_loc=0x80000533
+       sleep 15
+       dd if=/dev/zero of=$DIR/$tfile count=1 conv=sync || error "dd failed"
+}
+run_test 812c "idle import vs lock enqueue race"
+
 test_813() {
        local file_heat_sav=$($LCTL get_param -n llite.*.file_heat 2>/dev/null)
        [ -z "$file_heat_sav" ] && skip "no file heat support"