From: Alexander Boyko Date: Wed, 3 Feb 2021 11:04:52 +0000 (-0500) Subject: LU-14397 ptlrpc: idle import vs lock enqueue race X-Git-Tag: 2.14.52~103 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=e6af3c529021976e6df5b5e729d6a6197d27fe11 LU-14397 ptlrpc: idle import vs lock enqueue race There is a window between the ptlrpc_check_import_is_idle() check and setting LUSTRE_IMP_CONNECTING during which a lock enqueue can proceed. The lock gets granted on the OST and is returned to the client. The server's lock is destroyed on OST_DISCONNECT. Perform the import counters check together with setting LUSTRE_IMP_CONNECTING. A regression test, test_812c, was added to sanity.sh. HPE-bug-id: LUS-8705 Signed-off-by: Andriy Skulysh Signed-off-by: Alexander Boyko Change-Id: I85da18b29ca58f811ecde8ce72ba24373388947e Reviewed-on: https://review.whamcloud.com/41403 Tested-by: jenkins Reviewed-by: Alex Zhuravlev Reviewed-by: Andriy Skulysh Tested-by: Maloo Reviewed-by: Alexey Lyashkov Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 48cb678..2b3902a 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -455,6 +455,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_PTLRPC_ROUND_XID 0x530 #define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531 #define OBD_FAIL_NET_ERROR_RPC 0x532 +#define OBD_FAIL_PTLRPC_IDLE_RACE 0x533 #define OBD_FAIL_OBD_PING_NET 0x600 /* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c index 6485eea..3d0cf0f 100644 --- a/lustre/osc/osc_lock.c +++ b/lustre/osc/osc_lock.c @@ -1043,6 +1043,11 @@ enqueue_base: if (osc_lock_is_lockless(oscl)) { oio->oi_lockless = 1; } else if (!async) { + if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_IDLE_RACE)) { + OBD_RACE(OBD_FAIL_PTLRPC_IDLE_RACE); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1) / 2); + } LASSERT(oscl->ols_state == OLS_GRANTED); LASSERT(oscl->ols_hold); LASSERT(oscl->ols_dlmlock != 
NULL); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index d2e45fa..969199b 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -1695,7 +1695,6 @@ static struct ptlrpc_request *ptlrpc_disconnect_prep_req(struct obd_import *imp) req->rq_timeout = min_t(timeout_t, req->rq_timeout, INITIAL_CONNECT_TIMEOUT); - import_set_state(imp, LUSTRE_IMP_CONNECTING); req->rq_send_state = LUSTRE_IMP_CONNECTING; ptlrpc_request_set_replen(req); @@ -1744,14 +1743,18 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) rc = -EINTR; } + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + GOTO(set_state, rc = PTR_ERR(req)); + spin_lock(&imp->imp_lock); - if (imp->imp_state != LUSTRE_IMP_FULL) + if (imp->imp_state != LUSTRE_IMP_FULL) { + ptlrpc_req_finished_with_imp_lock(req); GOTO(out, rc); + } + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); spin_unlock(&imp->imp_lock); - req = ptlrpc_disconnect_prep_req(imp); - if (IS_ERR(req)) - GOTO(set_state, rc = PTR_ERR(req)); rc = ptlrpc_queue_wait(req); ptlrpc_req_finished(req); @@ -1832,6 +1835,21 @@ static int ptlrpc_disconnect_idle_interpret(const struct lu_env *env, return 0; } +static bool ptlrpc_can_idle(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + + /* one request for disconnect rpc */ + if (atomic_read(&imp->imp_reqs) > 1) + return false; + + /* any lock increases ns_bref being a resource holder */ + if (ns && atomic_read(&ns->ns_bref) > 0) + return false; + + return true; +} + int ptlrpc_disconnect_and_idle_import(struct obd_import *imp) { struct ptlrpc_request *req; @@ -1843,30 +1861,38 @@ int ptlrpc_disconnect_and_idle_import(struct obd_import *imp) if (ptlrpc_import_in_recovery(imp)) RETURN(0); + req = ptlrpc_disconnect_prep_req(imp); + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret; + + if (OBD_FAIL_PRECHECK(OBD_FAIL_PTLRPC_IDLE_RACE)) { + __u32 idx; + + 
server_name2index(imp->imp_obd->obd_name, &idx, NULL); + if (idx == 0) + OBD_RACE(OBD_FAIL_PTLRPC_IDLE_RACE); + } + spin_lock(&imp->imp_lock); - if (imp->imp_state != LUSTRE_IMP_FULL) { + if (imp->imp_state != LUSTRE_IMP_FULL || !ptlrpc_can_idle(imp)) { + ptlrpc_req_finished_with_imp_lock(req); spin_unlock(&imp->imp_lock); RETURN(0); } + import_set_state_nolock(imp, LUSTRE_IMP_CONNECTING); + /* don't make noise at reconnection */ + imp->imp_was_idle = 1; spin_unlock(&imp->imp_lock); - req = ptlrpc_disconnect_prep_req(imp); - if (IS_ERR(req)) - RETURN(PTR_ERR(req)); - CDEBUG_LIMIT(imp->imp_idle_debug, "%s: disconnect after %llus idle\n", imp->imp_obd->obd_name, ktime_get_real_seconds() - imp->imp_last_reply_time); - /* don't make noise at reconnection */ - spin_lock(&imp->imp_lock); - imp->imp_was_idle = 1; - spin_unlock(&imp->imp_lock); - - req->rq_interpret_reply = ptlrpc_disconnect_idle_interpret; ptlrpcd_add_req(req); - RETURN(0); + RETURN(1); } EXPORT_SYMBOL(ptlrpc_disconnect_and_idle_import); diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index 3b1edc4..9bcfebd 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -134,8 +134,9 @@ static int ptlrpc_ping(struct obd_import *imp) ENTRY; - if (ptlrpc_check_import_is_idle(imp)) - RETURN(ptlrpc_disconnect_and_idle_import(imp)); + if (ptlrpc_check_import_is_idle(imp) && + ptlrpc_disconnect_and_idle_import(imp) == 1) + RETURN(0); req = ptlrpc_prep_ping(imp); if (!req) { diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index c4e305d..36c19bd 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -25601,6 +25601,28 @@ test_812b() { # LU-12378 } run_test 812b "do not drop no resend request for idle connect" +test_812c() { + local old + + old=$($LCTL get_param -n osc.*.idle_timeout | head -n 1) + + $LFS setstripe -c 1 -o 0 $DIR/$tfile + $LFS getstripe $DIR/$tfile + $LCTL set_param osc.*.idle_timeout=10 + stack_trap "$LCTL set_param osc.*.idle_timeout=$old" EXIT + # 
ensure ost1 is connected + stat $DIR/$tfile >/dev/null || error "can't stat" + wait_osc_import_state client ost1 FULL + # no locks, no reqs to let the connection idle + cancel_lru_locks osc + +#define OBD_FAIL_PTLRPC_IDLE_RACE 0x533 + $LCTL set_param fail_loc=0x80000533 + sleep 15 + dd if=/dev/zero of=$DIR/$tfile count=1 conv=sync || error "dd failed" +} +run_test 812c "idle import vs lock enqueue race" + test_813() { local file_heat_sav=$($LCTL get_param -n llite.*.file_heat 2>/dev/null) [ -z "$file_heat_sav" ] && skip "no file heat support"