From 1eee11c75ca13745d083410e1ced3a1a8b088ee9 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Mon, 20 Apr 2020 09:51:29 -0400 Subject: [PATCH] LU-13131 osc: Do not wait for grants for too long obd_timeout is way too long considering we are holding a lock that might be contended. If OST is slow to respond, we might get evicted, so limit us to a half of the shortest possible max wait a server might have before switching to synchronous IO. Change-Id: I36653194c1b8b95ba3cc2ed9240df7b0888cf7ed Signed-off-by: Oleg Drokin Reviewed-on: https://review.whamcloud.com/38283 Tested-by: jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Bobi Jam --- lustre/include/lustre_dlm.h | 2 ++ lustre/ldlm/ldlm_request.c | 1 + lustre/osc/osc_cache.c | 18 ++++++++++++++---- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 07970ea..a1bda70 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -1728,6 +1728,8 @@ int ldlm_cli_inodebits_convert(struct ldlm_lock *lock, /** @} ldlm_cli_api */ +extern unsigned int ldlm_enqueue_min; + /* mds/handler.c */ /* This has to be here because recursive inclusion sucks. 
*/ int intent_disposition(struct ldlm_reply *rep, int flag); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index a8084b1..6a570c3 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -67,6 +67,7 @@ unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; module_param(ldlm_enqueue_min, uint, 0644); MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); +EXPORT_SYMBOL(ldlm_enqueue_min); /* in client side, whether the cached locks will be canceled before replay */ unsigned int ldlm_cancel_unused_locks_before_replay = 1; diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index 327f938..93cd2d6 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -38,6 +38,7 @@ #define DEBUG_SUBSYSTEM S_OSC #include +#include #include "osc_internal.h" @@ -1581,12 +1582,21 @@ static inline void cli_lock_after_unplug(struct client_obd *cli) static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, struct osc_async_page *oap, int bytes) { - struct osc_object *osc = oap->oap_obj; - struct lov_oinfo *loi = osc->oo_oinfo; - int rc = -EDQUOT; - unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout : at_max); + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + int rc = -EDQUOT; int remain; bool entered = false; + /* We cannot wait for a long time here since we are holding ldlm lock + * across the actual IO. If no requests complete fast (e.g. due to + * overloaded OST that takes a long time to process everything), we'd + * get evicted if we wait for a normal obd_timeout or some such. + * So we try to wait half the time it would take the client to be + * evicted by server which is half obd_timeout when AT is off + * or at least ldlm_enqueue_min with AT on. + * See LU-13131 */ + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout / 2 : + ldlm_enqueue_min / 2); ENTRY; -- 1.8.3.1