From 0dda74eec7a29c98c7b6ee9a99e54c7dbefcabca Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Thu, 19 Mar 2020 15:24:28 -0400 Subject: [PATCH] LU-13131 osc: Do not wait for grants for too long obd_timeout is way too long considering we are holding a lock that might be contended. If OST is slow to respond, we might get evicted, so limit us to a half of the shortest possible max wait a server might have before switching to synchronous IO. Lustre-change: https://review.whamcloud.com/38283 Lustre-commit: 1eee11c75ca13745d083410e1ced3a1a8b088ee9 Change-Id: I36653194c1b8b95ba3cc2ed9240df7b0888cf7ed Signed-off-by: Oleg Drokin Reviewed-on: https://review.whamcloud.com/38672 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- lustre/include/lustre_dlm.h | 2 ++ lustre/ldlm/ldlm_request.c | 1 + lustre/osc/osc_cache.c | 18 ++++++++++++++---- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 1babb16..559c374 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -1726,6 +1726,8 @@ int ldlm_cli_dropbits_list(struct list_head *converts, __u64 drop_bits); /** @} ldlm_cli_api */ +extern unsigned int ldlm_enqueue_min; + /* mds/handler.c */ /* This has to be here because recursive inclusion sucks. 
*/ int intent_disposition(struct ldlm_reply *rep, int flag); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index f86a08a..0a03534 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -67,6 +67,7 @@ unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; module_param(ldlm_enqueue_min, uint, 0644); MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); +EXPORT_SYMBOL(ldlm_enqueue_min); /* in client side, whether the cached locks will be canceled before replay */ unsigned int ldlm_cancel_unused_locks_before_replay = 1; diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index 8026d51..1a607c7 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -38,6 +38,7 @@ #define DEBUG_SUBSYSTEM S_OSC #include <lustre_osc.h> +#include <lustre_dlm.h> #include "osc_internal.h" @@ -1591,12 +1592,21 @@ static inline void cli_lock_after_unplug(struct client_obd *cli) static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, struct osc_async_page *oap, int bytes) { - struct osc_object *osc = oap->oap_obj; - struct lov_oinfo *loi = osc->oo_oinfo; - int rc = -EDQUOT; - unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout : at_max); + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + int rc = -EDQUOT; int remain; bool entered = false; + /* We cannot wait for a long time here since we are holding ldlm lock + * across the actual IO. If no requests complete fast (e.g. due to + * overloaded OST that takes a long time to process everything), we'd + * get evicted if we wait for a normal obd_timeout or some such. + * So we try to wait half the time it would take the client to be + * evicted by server which is half obd_timeout when AT is off + * or at least ldlm_enqueue_min with AT on. + * See LU-13131 */ + unsigned long timeout = cfs_time_seconds(AT_OFF ? obd_timeout / 2 : + ldlm_enqueue_min / 2); ENTRY; -- 1.8.3.1