From bc6a7c36cab621de7f2a1522dc986923cb29c18e Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Mon, 23 Aug 2021 10:29:18 +0300 Subject: [PATCH] LU-14956 fld: repeat failed FLDB lookup It's possible that an LWP reconnection is in progress after a remote MDS restart. If the FLDB misses an entry, then the FLDB lookup can fail with EAGAIN and the whole RPC processing (like MDS_REINT) can fail as well. Retry the lookup a few times in case of EAGAIN. Signed-off-by: Alex Zhuravlev Change-Id: Ib6aeaf7706a6465b0c8bee696d985bb440ed192e Reviewed-on: https://review.whamcloud.com/44723 Reviewed-by: Andreas Dilger Reviewed-by: Lai Siyao Tested-by: jenkins Tested-by: Maloo Reviewed-by: Mike Pershin Reviewed-by: Oleg Drokin --- lustre/fld/fld_handler.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lustre/fld/fld_handler.c b/lustre/fld/fld_handler.c index c5fe552..0fbd690 100644 --- a/lustre/fld/fld_handler.c +++ b/lustre/fld/fld_handler.c @@ -263,6 +263,8 @@ int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, fld->lsf_name, seq, -ENOENT); RETURN(-ENOENT); } else { + int i; + if (!fld->lsf_control_exp) { CERROR("%s: lookup %#llx, but not connects to MDT0 yet: rc = %d.\n", fld->lsf_name, seq, -EIO); @@ -274,8 +276,13 @@ int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, * replication on all mdt servers. */ range->lsr_start = seq; - rc = fld_client_rpc(fld->lsf_control_exp, - range, FLD_QUERY, NULL); + for (i = 0; i < 5; i++) { + rc = fld_client_rpc(fld->lsf_control_exp, + range, FLD_QUERY, NULL); + if (rc != -EAGAIN) + break; + schedule_timeout_interruptible(cfs_time_seconds(1)); + } if (rc == 0) fld_cache_insert(fld->lsf_cache, range); } -- 1.8.3.1