LU-18878 ptlrpc: improve ping evictor and recovery timeout

author Vitaly Fertman <c17818@cray.com>

Mon, 31 Mar 2025 17:51:28 +0000 (20:51 +0300)

committer Oleg Drokin <green@whamcloud.com>

Wed, 16 Apr 2025 20:41:28 +0000 (20:41 +0000)
author Vitaly Fertman <c17818@cray.com>
Mon, 31 Mar 2025 17:51:28 +0000 (20:51 +0300)
committer Oleg Drokin <green@whamcloud.com>
Wed, 16 Apr 2025 20:41:28 +0000 (20:41 +0000)
diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h

index a1ec2cf..bdb5485 100644 (file)
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -1515,7 +1515,6 @@ int ldlm_request_cancel(struct ptlrpc_request *req,
  
  void ldlm_revoke_export_locks(struct obd_export *exp);
  timeout_t ldlm_bl_timeout(struct ldlm_lock *lock);
-timeout_t ldlm_bl_timeout_by_rpc(struct ptlrpc_request *req);
  #endif
  int ldlm_del_waiting_lock(struct ldlm_lock *lock);
  int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout);
diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h

index 844fc4f..afdc282 100644 (file)
--- a/lustre/include/lustre_export.h
+++ b/lustre/include/lustre_export.h
@@ -196,7 +196,7 @@ struct obd_export {
          * order
          * protected by obd_dev_lock
          */
-       struct list_head        exp_obd_chain_timed;
+       struct list_head        exp_timed_chain;
         /** Obd device of this export */
         struct obd_device      *exp_obd;
         /**
@@ -223,6 +223,7 @@ struct obd_export {
         __u64                   exp_last_committed;
         /** When was last request received */
         time64_t                exp_last_request_time;
+       time64_t                exp_deadline;
         /** On replay all requests waiting for replay are linked here */
         struct list_head        exp_req_replay_queue;
         /**
@@ -260,7 +261,7 @@ struct obd_export {
                                  */
                                 exp_old_falloc:1,
                                 exp_hashed:1,
-                               exp_not_timed:1;
+                               exp_timed:1;
         /* also protected by exp_lock */
         enum lustre_sec_part    exp_sp_peer;
         struct sptlrpc_flavor   exp_flvr;               /* current */
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h

index 5049c6b..c608124 100644 (file)
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -2261,8 +2261,9 @@ int ptlrpc_service_health_check(struct ptlrpc_service *service);
  void ptlrpc_server_drop_request(struct ptlrpc_request *req);
  void ptlrpc_request_change_export(struct ptlrpc_request *req,
                                   struct obd_export *export);
-void ptlrpc_update_export_timer(struct obd_export *exp,
-                               time64_t extra_delay);
+void ptlrpc_update_export_timer(struct ptlrpc_request *req);
+timeout_t ptlrpc_export_prolong_timeout(struct ptlrpc_request *req,
+                                       bool recovery);
  
  int ptlrpc_hr_init(void);
  void ptlrpc_hr_fini(void);
diff --git a/lustre/include/obd.h b/lustre/include/obd.h

index 5a82ce2..af7e508 100644 (file)
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -683,7 +683,7 @@ struct obd_device {
         struct obd_export       *obd_self_export;
         struct obd_export       *obd_lwp_export;
         /* list of exports in LRU order, for ping evictor, with obd_dev_lock */
-       struct list_head        obd_exports_timed;
+       struct rb_root          obd_exports_timed;
         time64_t                obd_eviction_timer;     /* for ping evictor */
  
         atomic_t                obd_max_recoverable_clients;
diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h

index 00b2448..f04f318 100644 (file)
--- a/lustre/include/obd_class.h
+++ b/lustre/include/obd_class.h
@@ -406,6 +406,12 @@ void class_import_put(struct obd_import *);
  struct obd_import *class_new_import(struct obd_device *obd);
  void class_destroy_import(struct obd_import *exp);
  
+int obd_export_timed_init(struct obd_export *exp, void **data);
+void obd_export_timed_fini(struct obd_export *exp, void **data);
+void obd_export_timed_add(struct obd_export *exp, void **data);
+void obd_export_timed_del(struct obd_export *exp);
+struct obd_export *obd_export_timed_get(struct obd_device *obd, bool last);
+
  #ifdef HAVE_SERVER_SUPPORT
  struct obd_type *class_search_type(const char *name);
  struct obd_type *class_get_type(const char *name);
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index 589f052..9e7a4bd 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -41,7 +41,6 @@ extern unsigned int obd_lbug_on_eviction;
  extern unsigned int obd_timeout;          /* seconds */
  extern unsigned int ldlm_timeout;         /* seconds */
  extern unsigned int ping_interval;        /* seconds */
-extern unsigned int ping_evict_timeout_multiplier;
  extern unsigned int obd_timeout_set;
  extern unsigned int ldlm_timeout_set;
  extern unsigned int bulk_timeout;
@@ -90,7 +89,7 @@ extern bool obd_enable_fname_encoding;
   * and there's no urgent need to evict a client just because it's idle, we
   * should be very conservative here.
   */
-#define PING_EVICT_TIMEOUT (PING_INTERVAL * ping_evict_timeout_multiplier)
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
  #define DISK_TIMEOUT 50          /* Beyond this we warn about disk speed */
  #define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
   /* Max connect interval for nonresponsive servers; ~50s to avoid building up
diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c

index ae061bd..04825c9 100644 (file)
--- a/lustre/ldlm/ldlm_extent.c
+++ b/lustre/ldlm/ldlm_extent.c
@@ -770,7 +770,7 @@ void ldlm_lock_prolong_one(struct ldlm_lock *lock,
         /* OK. this is a possible lock the user holds doing I/O
          * let's refresh eviction timer for it.
          */
-       timeout = ldlm_bl_timeout_by_rpc(arg->lpa_req);
+       timeout = ptlrpc_export_prolong_timeout(arg->lpa_req, false);
         LDLM_DEBUG(lock, "refreshed to %ds. ", timeout);
         ldlm_refresh_waiting_lock(lock, timeout);
  }
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c

index c24d892..80f5699 100644 (file)
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -980,6 +980,7 @@ int rev_import_init(struct obd_export *export)
  {
         struct obd_device *obd = export->exp_obd;
         struct obd_import *revimp;
+       int rc = 0;
  
         LASSERT(export->exp_imp_reverse == NULL);
  
@@ -997,13 +998,22 @@ int rev_import_init(struct obd_export *export)
         spin_unlock(&export->exp_lock);
         class_import_put(revimp);
  
-       if (!export->exp_not_timed) {
-               spin_lock(&obd->obd_dev_lock);
-               list_add_tail(&export->exp_obd_chain_timed,
-                             &obd->obd_exports_timed);
-               spin_unlock(&obd->obd_dev_lock);
+       if (export->exp_timed) {
+               void *data;
+
+               rc = obd_export_timed_init(export, &data);
+               if (rc == 0) {
+                       spin_lock(&obd->obd_dev_lock);
+                       /* At the beginning, there are no AT stats yet, so use
+                        * the previous approach for the ping evictor timeout */
+                       export->exp_deadline =
+                               PING_EVICT_TIMEOUT + ktime_get_real_seconds();
+                       obd_export_timed_add(export, &data);
+                       spin_unlock(&obd->obd_dev_lock);
+                       obd_export_timed_fini(export, &data);
+               }
         }
-       return 0;
+       return rc;
  }
  EXPORT_SYMBOL(rev_import_init);
  
@@ -1497,7 +1507,7 @@ dont_check_exports:
                          * should be called to cleanup stuff
                          */
                         spin_lock(&target->obd_dev_lock);
-                       list_del_init(&export->exp_obd_chain_timed);
+                       obd_export_timed_del(export);
                         spin_unlock(&target->obd_dev_lock);
  
                         class_export_get(export);
@@ -2430,29 +2440,8 @@ static void handle_recovery_req(struct ptlrpc_thread *thread,
                  * Add request @timeout to the recovery time so next request from
                  * this client may come in recovery time
                  */
-               if (!obd_at_off(obd)) {
-                       struct ptlrpc_service_part *svcpt;
-                       timeout_t est_timeout;
-
-                       svcpt = req->rq_rqbd->rqbd_svcpt;
-                       /*
-                        * If the server sent early reply for this request,
-                        * the client will recalculate the timeout according to
-                        * current server estimate service time, so we will
-                        * use the maxium timeout here for waiting the client
-                        * sending the next req
-                        */
-                       est_timeout = obd_at_get(obd, &svcpt->scp_at_estimate);
-                       timeout = max_t(timeout_t, at_est2timeout(est_timeout),
-                                       lustre_msg_get_timeout(req->rq_reqmsg));
-                       /*
-                        * Add 2 net_latency, one for balance rq_deadline
-                        * (see ptl_send_rpc), one for resend the req to server,
-                        * Note: client will pack net_latency in replay req
-                        * (see ptlrpc_replay_req)
-                        */
-                       timeout += 2 * lustre_msg_get_service_timeout(req->rq_reqmsg);
-               }
+               if (!obd_at_off(obd))
+                       timeout = ptlrpc_export_prolong_timeout(req, true);
                 extend_recovery_timer(class_exp2obd(req->rq_export), timeout,
                                       true);
         }
@@ -2843,7 +2832,7 @@ static int target_recovery_thread(void *arg)
                  * so we need refresh the last_request_time, to avoid the
                  * export is being evicted
                  */
-               ptlrpc_update_export_timer(req->rq_export, 0);
+               ptlrpc_update_export_timer(req);
         }
  
         /*
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c

index 9a292fb..3d3c1c3 100644 (file)
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -648,59 +648,6 @@ timeout_t ldlm_bl_timeout(struct ldlm_lock *lock)
  EXPORT_SYMBOL(ldlm_bl_timeout);
  
  /**
- * Calculate the per-export Blocking timeout by the given RPC (covering the
- * reply to this RPC and the next RPC). The next RPC could be still not CANCEL,
- * but having the lock refresh mechanism it is enough.
- *
- * Used for lock refresh timeout when we are in the middle of the process -
- * BL AST is sent, CANCEL is ahead - it is still 1 reply for the current RPC
- * and at least 1 RPC (which will trigger another refresh if it will be not
- * CANCEL) - but more accurate than ldlm_bl_timeout as the timeout is taken
- * from the RPC (i.e. the view of the client on the current AT) is taken into
- * account.
- *
- * \param[in] req     req which export needs the timeout calculation
- *
- * \retval            timeout in seconds to wait for the next client's RPC
- */
-timeout_t ldlm_bl_timeout_by_rpc(struct ptlrpc_request *req)
-{
-       struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
-       timeout_t timeout, req_timeout, at_timeout, netl;
-       struct obd_device *obd = req->rq_export->exp_obd;
-
-       if (obd_at_off(obd))
-               return obd_timeout / 2;
-
-       /* A blocked lock means somebody in the cluster is waiting, and we
-        * should not consider the worst ever case, consisting of a chain of
-        * failures on each step, however this timeout should survive a
-        * recovery of at least 1 failure, let this one to be the worst one:
-        * in case a server NID is dead first re-connect is done through the
-        * same router and also times out.
-        *
-        * Either this on the next RPC times out, take the max.
-        * Considering the current RPC, take just the left time.
-        */
-       netl = obd_at_get(obd,
-                         &req->rq_export->exp_imp_reverse->imp_at.iat_net_latency);
-       req_timeout = req->rq_deadline - ktime_get_real_seconds() + netl;
-       at_timeout = at_est2timeout(obd_at_get(obd, &svcpt->scp_at_estimate))
-                                   + netl;
-       req_timeout = max(req_timeout, at_timeout);
-
-       /* Take 1 re-connect failure and 1 re-connect success into account. */
-       timeout = at_timeout + INITIAL_CONNECT_TIMEOUT + netl + req_timeout;
-
-       /* Client's timeout is calculated as at_est2timeout(), let's be a bit
-        * more conservative than client
-        */
-       return max(timeout + (timeout >> 4),
-                  (timeout_t)obd_get_ldlm_enqueue_min(obd));
-}
-EXPORT_SYMBOL(ldlm_bl_timeout_by_rpc);
-
-/**
   * Perform lock cleanup if AST sending failed.
   */
  static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,
diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c

index 1187f06..fa23671 100644 (file)
--- a/lustre/mdt/mdt_handler.c
+++ b/lustre/mdt/mdt_handler.c
@@ -7183,14 +7183,16 @@ static int mdt_connect_internal(const struct lu_env *env,
                 return -EBADE;
         }
  
-       if (OCD_HAS_FLAG(data, PINGLESS)) {
-               if (ptlrpc_pinger_suppress_pings()) {
-                       spin_lock(&exp->exp_lock);
-                       exp->exp_not_timed = 1;
-                       spin_unlock(&exp->exp_lock);
-               } else {
-                       data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
-               }
+       if (OCD_HAS_FLAG(data, PINGLESS) && !ptlrpc_pinger_suppress_pings())
+               data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+
+       /* Because we do not want this export to be evicted by pinger,
+        * let's not add this export to the timed chain list. */
+       if (!OCD_HAS_FLAG(data, PINGLESS) &&
+           !(data->ocd_connect_flags & OBD_CONNECT_MDS_MDS)) {
+               spin_lock(&exp->exp_lock);
+               exp->exp_timed = 1;
+               spin_unlock(&exp->exp_lock);
         }
  
         data->ocd_max_easize = mdt->mdt_max_ea_size;
@@ -7476,14 +7478,6 @@ out:
                 *exp = NULL;
         } else {
                 *exp = lexp;
-               /* Because we do not want this export to be evicted by pinger,
-                * let's not add this export to the timed chain list.
-                */
-               if (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) {
-                       spin_lock(&lexp->exp_lock);
-                       lexp->exp_not_timed = 1;
-                       spin_unlock(&lexp->exp_lock);
-               }
         }
  
         RETURN(rc);
@@ -7514,12 +7508,6 @@ static int mdt_obd_reconnect(const struct lu_env *env,
         else
                 nodemap_del_member(exp);
  
-       if (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) {
-               spin_lock(&exp->exp_lock);
-               exp->exp_not_timed = 1;
-               spin_unlock(&exp->exp_lock);
-       }
-
         RETURN(rc);
  }
  
diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c

index bf6aa51..0a56daf 100644 (file)
--- a/lustre/obdclass/class_obd.c
+++ b/lustre/obdclass/class_obd.c
@@ -58,8 +58,6 @@ EXPORT_SYMBOL(ldlm_timeout);
  unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ?
                              (OBD_TIMEOUT_DEFAULT / 4) : 1;
  EXPORT_SYMBOL(ping_interval);
-unsigned int ping_evict_timeout_multiplier = 6;
-EXPORT_SYMBOL(ping_evict_timeout_multiplier);
  unsigned int obd_timeout_set;
  EXPORT_SYMBOL(obd_timeout_set);
  unsigned int ldlm_timeout_set;
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c

index e65ef9f..b9f1557 100644 (file)
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -378,7 +378,7 @@ struct obd_device *class_newdev(const char *type_name, const char *name,
         newdev->obd_grant_check_threshold = 100;
         INIT_LIST_HEAD(&newdev->obd_unlinked_exports);
         INIT_LIST_HEAD(&newdev->obd_delayed_exports);
-       INIT_LIST_HEAD(&newdev->obd_exports_timed);
+       newdev->obd_exports_timed.rb_node = NULL;
         INIT_LIST_HEAD(&newdev->obd_nid_stats);
         spin_lock_init(&newdev->obd_nid_lock);
         spin_lock_init(&newdev->obd_dev_lock);
@@ -982,7 +982,7 @@ static struct obd_export *__class_new_export(struct obd_device *obd,
         spin_lock_init(&export->exp_bl_list_lock);
         INIT_LIST_HEAD(&export->exp_bl_list);
         INIT_LIST_HEAD(&export->exp_stale_list);
-       INIT_LIST_HEAD(&export->exp_obd_chain_timed);
+       INIT_LIST_HEAD(&export->exp_timed_chain);
         INIT_WORK(&export->exp_zombie_work, obd_zombie_exp_cull);
  
         export->exp_sp_peer = LUSTRE_SP_ANY;
@@ -1040,6 +1040,118 @@ struct obd_export *class_new_export_self(struct obd_device *obd,
         return __class_new_export(obd, uuid, true);
  }
  
+struct rb_node_exp_deadline { /* one obd_exports_timed tree node per deadline */
+       struct rb_node    ned_node;     /* linkage in obd_exports_timed */
+       struct list_head  ned_head;     /* exports chained by exp_timed_chain */
+       time64_t          ned_deadline; /* tree key, copied from exp_deadline */
+};
+
+/*
+ * Ordering callback handed to rb_add(): reports whether the deadline stored
+ * in the node containing \a ln is strictly earlier than the one in \a rn.
+ */
+static inline bool ptlrpc_exp_deadline_less(struct rb_node *ln,
+                                           const struct rb_node *rn)
+{
+       struct rb_node_exp_deadline *lhs =
+               rb_entry(ln, struct rb_node_exp_deadline, ned_node);
+       struct rb_node_exp_deadline *rhs =
+               rb_entry(rn, struct rb_node_exp_deadline, ned_node);
+
+       return lhs->ned_deadline < rhs->ned_deadline;
+}
+
+/*
+ * Lookup callback handed to rb_find(): \a key points to a time64_t deadline.
+ * Returns -1 / 1 when the key is smaller / larger than the deadline stored in
+ * the node containing \a node, and 0 when they are equal.
+ */
+static inline int ptlrpc_exp_deadline_cmp(const void *key,
+                                         const struct rb_node *node)
+{
+       const time64_t wanted = *(const time64_t *)key;
+       const struct rb_node_exp_deadline *ned =
+               rb_entry(node, struct rb_node_exp_deadline, ned_node);
+
+       if (wanted < ned->ned_deadline)
+               return -1;
+       if (wanted > ned->ned_deadline)
+               return 1;
+       return 0;
+}
+
+/**
+ * Preallocate a deadline-tree node for a following obd_export_timed_add().
+ *
+ * The node is allocated here so that the caller can do the allocation before
+ * taking obd_dev_lock and only link the node in while holding the lock; any
+ * node that obd_export_timed_add() did not consume is released again by
+ * obd_export_timed_fini().
+ *
+ * \param[in]  exp     export the node is prepared for (not used here)
+ * \param[out] data    receives the allocated struct rb_node_exp_deadline
+ *
+ * \retval 0           the node was allocated and stored in \a *data
+ * \retval -ENOMEM     the allocation failed
+ */
+int obd_export_timed_init(struct obd_export *exp, void **data)
+{
+       OBD_ALLOC(*data, sizeof(struct rb_node_exp_deadline));
+       /* Test the allocation result (*data), not the caller's pointer to it:
+        * \a data itself is never NULL, so checking it would hide a failed
+        * allocation. OBD_ALLOC is expected to leave *data NULL on failure.
+        */
+       return *data == NULL ? -ENOMEM : 0;
+}
+EXPORT_SYMBOL(obd_export_timed_init);
+
+/**
+ * Release a preallocated deadline-tree node that is still held in \a *data.
+ *
+ * Does nothing when \a *data is NULL, which is what obd_export_timed_add()
+ * leaves behind after linking the node into the tree.
+ *
+ * \param[in]     exp  export the node was prepared for (not used here)
+ * \param[in,out] data node to free; reset to NULL afterwards
+ */
+void obd_export_timed_fini(struct obd_export *exp, void **data)
+{
+       if (*data == NULL)
+               return;
+
+       OBD_FREE(*data, sizeof(struct rb_node_exp_deadline));
+       *data = NULL;
+}
+EXPORT_SYMBOL(obd_export_timed_fini);
+
+/**
+ * Link \a exp into its device's obd_exports_timed tree under exp_deadline.
+ *
+ * Exports sharing one deadline are chained on a single tree node. When a node
+ * for exp_deadline already exists the export is appended to it and \a *data
+ * is left untouched; otherwise the preallocated node in \a *data is
+ * initialised, inserted into the tree and \a *data is set to NULL.
+ *
+ * \param[in]     exp  export to add; exp_deadline must already be set
+ * \param[in,out] data preallocated struct rb_node_exp_deadline
+ */
+void obd_export_timed_add(struct obd_export *exp, void **data)
+{
+       struct rb_root *root = &exp->exp_obd->obd_exports_timed;
+       struct rb_node_exp_deadline *ned = *data;
+       struct rb_node *found;
+
+       found = rb_find(&exp->exp_deadline, root, ptlrpc_exp_deadline_cmp);
+       if (found != NULL) {
+               /* a node for this deadline exists already, join its list */
+               ned = rb_entry(found, struct rb_node_exp_deadline, ned_node);
+               LASSERT(!list_empty(&ned->ned_head));
+       } else {
+               /* first export with this deadline, consume the new node */
+               LASSERT(ned != NULL);
+               INIT_LIST_HEAD(&ned->ned_head);
+               RB_CLEAR_NODE(&ned->ned_node);
+               ned->ned_deadline = exp->exp_deadline;
+               *data = NULL;
+
+               rb_add(&ned->ned_node, root, ptlrpc_exp_deadline_less);
+       }
+
+       list_add_tail(&exp->exp_timed_chain, &ned->ned_head);
+}
+EXPORT_SYMBOL(obd_export_timed_add);
+
+/**
+ * Unlink \a exp from its device's obd_exports_timed tree.
+ *
+ * An export whose exp_timed_chain is empty is not linked and is ignored.
+ * Otherwise the tree node is looked up by exp_deadline, the export is taken
+ * off the node's list, and the node itself is erased and freed once its list
+ * becomes empty.
+ *
+ * \param[in] exp      export to remove
+ */
+void obd_export_timed_del(struct obd_export *exp)
+{
+       struct rb_node_exp_deadline *ned;
+       struct rb_node *found;
+       struct rb_root *root;
+
+       if (list_empty(&exp->exp_timed_chain))
+               return;
+
+       root = &exp->exp_obd->obd_exports_timed;
+       found = rb_find(&exp->exp_deadline, root, ptlrpc_exp_deadline_cmp);
+       ned = rb_entry(found, struct rb_node_exp_deadline, ned_node);
+       LASSERT(!list_empty(&ned->ned_head));
+       LASSERT(ned->ned_deadline == exp->exp_deadline);
+       list_del_init(&exp->exp_timed_chain);
+
+       /* other exports still share this deadline, keep the node */
+       if (!list_empty(&ned->ned_head))
+               return;
+
+       rb_erase(&ned->ned_node, root);
+       OBD_FREE_PTR(ned);
+}
+EXPORT_SYMBOL(obd_export_timed_del);
+
+/**
+ * Peek at an export at one end of the obd_exports_timed tree of \a obd.
+ *
+ * \param[in] obd      device whose timed exports are inspected
+ * \param[in] last     true to take the rb_last() node, false for rb_first()
+ *
+ * \retval             first export chained on the selected tree node
+ * \retval NULL        the tree is empty
+ */
+struct obd_export *obd_export_timed_get(struct obd_device *obd, bool last)
+{
+       struct rb_node_exp_deadline *ned;
+       struct rb_node *pos;
+
+       if (last)
+               pos = rb_last(&obd->obd_exports_timed);
+       else
+               pos = rb_first(&obd->obd_exports_timed);
+       if (pos == NULL)
+               return NULL;
+
+       ned = rb_entry(pos, struct rb_node_exp_deadline, ned_node);
+       LASSERT(!list_empty(&ned->ned_head));
+
+       return list_first_entry(&ned->ned_head, struct obd_export,
+                               exp_timed_chain);
+}
+EXPORT_SYMBOL(obd_export_timed_get);
+
  void class_unlink_export(struct obd_export *exp)
  {
         class_handle_unhash(&exp->exp_handle);
@@ -1071,7 +1183,7 @@ void class_unlink_export(struct obd_export *exp)
  #endif /* HAVE_SERVER_SUPPORT */
  
         list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports);
-       list_del_init(&exp->exp_obd_chain_timed);
+       obd_export_timed_del(exp);
         exp->exp_obd->obd_num_exports--;
         spin_unlock(&exp->exp_obd->obd_dev_lock);
  
diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c

index 9ab855c..9a047d5 100644 (file)
--- a/lustre/obdclass/obd_config.c
+++ b/lustre/obdclass/obd_config.c
@@ -640,9 +640,6 @@ int class_attach(struct lustre_cfg *lcfg)
         }
  
         obd->obd_self_export = exp;
-       spin_lock(&exp->exp_lock);
-       exp->exp_not_timed = 1;
-       spin_unlock(&exp->exp_lock);
         class_export_put(exp);
  
         rc = class_register_device(obd);
diff --git a/lustre/obdclass/obd_sysfs.c b/lustre/obdclass/obd_sysfs.c

index 1006526..832731b 100644 (file)
--- a/lustre/obdclass/obd_sysfs.c
+++ b/lustre/obdclass/obd_sysfs.c
@@ -110,7 +110,6 @@ LUSTRE_STATIC_UINT_ATTR(at_unhealthy_factor, &at_unhealthy_factor);
  LUSTRE_STATIC_UINT_ATTR(enable_stats_header, &obd_enable_stats_header);
  LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction);
  LUSTRE_STATIC_UINT_ATTR(ping_interval, &ping_interval);
-LUSTRE_STATIC_UINT_ATTR(evict_multiplier, &ping_evict_timeout_multiplier);
  
  #ifdef HAVE_SERVER_SUPPORT
  LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout);
@@ -516,7 +515,6 @@ static struct attribute *lustre_attrs[] = {
         &lustre_attr_enable_fname_encoding.attr,
         &lustre_sattr_lbug_on_eviction.u.attr,
         &lustre_sattr_ping_interval.u.attr,
-       &lustre_sattr_evict_multiplier.u.attr,
         NULL,
  };
  
diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c

index 4760b58..c0710e7 100644 (file)
--- a/lustre/obdecho/echo_client.c
+++ b/lustre/obdecho/echo_client.c
@@ -2475,13 +2475,6 @@ static int echo_client_setup(const struct lu_env *env,
         ocd->ocd_group = FID_SEQ_ECHO;
  
         rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL);
-       if (rc == 0) {
-               /* Turn off pinger because it connects to tgt obd directly. */
-               spin_lock(&ec->ec_exp->exp_lock);
-               ec->ec_exp->exp_not_timed = 1;
-               spin_unlock(&ec->ec_exp->exp_lock);
-       }
-
         OBD_FREE(ocd, sizeof(*ocd));
  
         if (rc != 0) {
diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c

index 3fe0b4f..a321643 100644 (file)
--- a/lustre/ofd/ofd_obd.c
+++ b/lustre/ofd/ofd_obd.c
@@ -235,14 +235,13 @@ static int ofd_parse_connect_data(const struct lu_env *env,
  
         data->ocd_version = LUSTRE_VERSION_CODE;
  
-       if (OCD_HAS_FLAG(data, PINGLESS)) {
-               if (ptlrpc_pinger_suppress_pings()) {
-                       spin_lock(&exp->exp_lock);
-                       exp->exp_not_timed = 1;
-                       spin_unlock(&exp->exp_lock);
-               } else {
-                       data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
-               }
+       if (OCD_HAS_FLAG(data, PINGLESS) && !ptlrpc_pinger_suppress_pings())
+               data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+
+       if (!OCD_HAS_FLAG(data, PINGLESS)) {
+               spin_lock(&exp->exp_lock);
+               exp->exp_timed = 1;
+               spin_unlock(&exp->exp_lock);
         }
  
         if (!ofd->ofd_lut.lut_dt_conf.ddp_has_lseek_data_hole)
diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c

index a04b889..c758d58 100644 (file)
--- a/lustre/ptlrpc/pinger.c
+++ b/lustre/ptlrpc/pinger.c
@@ -445,7 +445,7 @@ static int ping_evictor_main(void *arg)
  {
         struct obd_device *obd;
         struct obd_export *exp;
-       time64_t expire_time;
+       time64_t current_time;
         struct lu_env env;
         int rc;
  
@@ -485,10 +485,9 @@ static int ping_evictor_main(void *arg)
                         CFS_FAIL_TIMEOUT(OBD_FAIL_OBD_PAUSE_EVICTOR,
                                          PING_INTERVAL + PING_EVICT_TIMEOUT);
  
-               expire_time = ktime_get_real_seconds() - PING_EVICT_TIMEOUT;
+               current_time = ktime_get_real_seconds();
  
-               CDEBUG(D_HA, "evicting all exports of obd %s older than %lld\n",
-                      obd->obd_name, expire_time);
+               CDEBUG(D_HA, "evicting all exports of obd %s\n", obd->obd_name);
  
                 /*
                  * Exports can't be deleted out of the list while we hold
@@ -497,24 +496,21 @@ static int ping_evictor_main(void *arg)
                  * removed from the list, we won't find them here.
                  */
                 spin_lock(&obd->obd_dev_lock);
-               while (!list_empty(&obd->obd_exports_timed)) {
-                       exp = list_first_entry(&obd->obd_exports_timed,
-                                              struct obd_export,
-                                              exp_obd_chain_timed);
-                       if (expire_time > exp->exp_last_request_time) {
+               while((exp = obd_export_timed_get(obd, false))) {
+                       if (current_time > exp->exp_deadline) {
                                 struct obd_uuid *client_uuid;
  
                                 class_export_get(exp);
                                 client_uuid = &exp->exp_client_uuid;
                                 spin_unlock(&obd->obd_dev_lock);
-                               LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %lld seconds. I think it's dead, and I am evicting it. exp %p, cur %lld expire %lld last %lld\n",
+                               LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %lld seconds. I think it's dead, and I am evicting it. exp %p, cur %lld deadline %lld last %lld\n",
                                               obd->obd_name,
                                               obd_uuid2str(client_uuid),
                                               obd_export_nid2str(exp),
                                               ktime_get_real_seconds() -
                                               exp->exp_last_request_time,
-                                             exp, ktime_get_real_seconds(),
-                                             expire_time,
+                                             exp, current_time,
+                                             exp->exp_deadline,
                                               exp->exp_last_request_time);
                                 CDEBUG(D_HA, "Last request was at %lld\n",
                                        exp->exp_last_request_time);
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c

index db55471..89bae0d 100644 (file)
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -1095,34 +1095,158 @@ static void ptlrpc_server_finish_active_request(
  }
  
  /**
+ * Calculate an export eviction timeout.
+ * Used for both cases, lock prolong timeout and ping evictor timeout.
+ *
+ * Whereas a problem client may be still alive trying hard to reconnect and to
+ * resend its RPCs, we should not consider the worst ever case, consisting of
+ * a chain of failures on each step. Let this timeout survive a recovery of
+ * just 1 failure, but let this be the worst possible one - a dead server NID:
+ *
+ * - an RPC timeout;
+ * - the first re-connect is sent to the same NID and times out;
+ * - the second re-connect to the failover pair returns an error;
+ * - the third re-connect to the original node to a different NID succeeds;
+ * - the RPC resend succeeds;
+ *
+ * For lock prolong timeout, we are in the middle of the process -
+ * BL AST is sent, CANCEL is ahead - there is still 1 reply for the current RPC
+ * and at least 1 more RPC (which will trigger another refresh if it is not a
+ * CANCEL) - but this is more accurate than ldlm_bl_timeout because the timeout
+ * is taken from the RPC, i.e. the client's view of the current AT is taken
+ * into account.
+ *
+ * \param[in] obd            OBD device whose AT settings are consulted
+ * \param[in] at             AT of RPC service time to calculate timeout for
+ * \param[in] netl           network AT (latency estimate)
+ * \param[in] rpc_left_time  left service time for the current RPC, 0 if N/A
+ * \param[in] pinger         true if the caller is the ping evictor
+ *
+ * \retval             timeout in seconds to wait for the next client's RPC
+ */
+static timeout_t ptlrpc_export_timeout(struct obd_device *obd,
+                                      struct adaptive_timeout *at,
+                                      timeout_t netl,
+                                      timeout_t rpc_left_time,
+                                      bool pinger)
+{
+       timeout_t timeout, at_timeout, req_timeout;
+
+       if (obd_at_off(obd))
+               return obd_timeout / 2;
+
+       if (pinger) {
+               /* There might be a delay till the next RPC. In fact it is two
+                * PING_INTERVALs due to ptlrpc_pinger_main logic. */
+               timeout = 2 * PING_INTERVAL;
+       } else {
+               /* For the lock prolong, we have an RPC in hand, which may still
+                * get its reply lost. Therefore, either this one or the next
+                * client's RPC may time out; take the max of the two.
+                * For the current RPC, count only its remaining time. */
+               LASSERT(at != NULL);
+               at_timeout = at_est2timeout(obd_at_get(obd, at)) + netl;
+               req_timeout = max(rpc_left_time + netl, at_timeout);
+               /* Adding the RPC resend time - not needed in the ping evictor
+                * case, as the export is updated on re-connect. */
+               timeout = req_timeout + at_timeout;
+       }
+
+       /* Adding the re-connect time: 1st re-connect timeout,
+        * 2nd reconnect error, 3rd reconnect success. */
+       timeout += 3 * (INITIAL_CONNECT_TIMEOUT + netl);
+
+       /* Be a bit more conservative than the client: add 1/16 on top */
+       return max(timeout + (timeout >> 4),
+                  (timeout_t)obd_get_ldlm_enqueue_min(obd));
+}
+
+/**
+ * Used for lock prolong timeout, calculates a timeout for CANCEL to come.
+ * Also used for recovery, calculates a timeout for a next recovery RPC to come.
+ * In this case, there is an RPC in hand. Thus, a particular svcpt AT is used.
+ * In recovery the net latency is the service timeout from the request message;
+ * otherwise the reverse import network AT estimates the client side one.
+ */
+timeout_t ptlrpc_export_prolong_timeout(struct ptlrpc_request *req,
+                                       bool recovery)
+{
+       timeout_t netl;
+
+       if (recovery)
+               netl = lustre_msg_get_service_timeout(req->rq_reqmsg);
+       else
+               netl = obd_at_get(req->rq_export->exp_obd,
+                                 &req->rq_export->exp_imp_reverse->
+                                 imp_at.iat_net_latency);
+
+       return ptlrpc_export_timeout(req->rq_export->exp_obd,
+                                    &req->rq_rqbd->rqbd_svcpt->scp_at_estimate,
+                                    netl, req->rq_deadline -
+                                    ktime_get_real_seconds(), false);
+}
+
+/**
+ * Used for ping evictor, calculates a timeout for any next RPC to come.
+ * As there are different portals and the AT stats are kept separately for
+ * them, just the svcpt AT of the last RPC is used here.
+ *
+ * The reverse import network AT is used as an estimate for the client side one.
+ */
+static timeout_t ptlrpc_export_pinger_timeout(struct ptlrpc_request *req)
+{
+       struct obd_import *revimp = req->rq_export->exp_imp_reverse;
+       timeout_t netl = obd_at_get(req->rq_export->exp_obd,
+                                   &revimp->imp_at.iat_net_latency);
+
+       return ptlrpc_export_timeout(req->rq_export->exp_obd,
+                                    &req->rq_rqbd->rqbd_svcpt->scp_at_estimate,
+                                    netl, 0, true);
+}
+
+/**
+ * In case the net was down and just came back, when the 1st timeout has
+ * already expired, clients just keep sending re-connects. Applying the same
+ * formula as in ptlrpc_export_timeout() to this case we get:
+ * - a previous reconnect to the not yet recovered network times out;
+ * - the second reconnect to the failover pair returns ENODEV;
+ * - the third reconnect succeeds;
+ */
+static timeout_t ptlrpc_export_extra_timeout(struct obd_export *exp)
+{
+       timeout_t netl;
+
+       /* As this is not the 1st re-connection failure, the client might
+        * have had its net latency extended to the max - CONNECTION_SWITCH_MAX */
+       netl = obd_at_get(exp->exp_obd,
+                         &exp->exp_imp_reverse->imp_at.iat_net_latency);
+       return 3 * INITIAL_CONNECT_TIMEOUT + CONNECTION_SWITCH_MAX + 2 * netl;
+}
+
+/**
   * This function makes sure dead exports are evicted in a timely manner.
   * This function is only called when some export receives a message (i.e.,
   * the network is up.)
   */
-void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay)
+void ptlrpc_update_export_timer(struct ptlrpc_request *req)
  {
-       struct obd_export *oldest_exp, *newest_exp;
-       time64_t oldest_time, current_time;
-       bool    evict = false;
+       struct obd_export *oldest_exp, *newest_exp, *exp;
+       time64_t current_time, timeout;
+       bool evict = false;
+       void *data;
+       int rc;
         ENTRY;
  
-       LASSERT(exp);
-
-       /*
-        * Compensate for slow machines, etc, by faking our request time
-        * into the future.  Although this can break the strict time-ordering
-        * of the list, we can be really lazy here - we don't have to evict
-        * at the exact right moment.  Eventually, all silent exports
-        * will make it to the top of the list.
-        */
+       LASSERT(req != NULL);
+       LASSERT(req->rq_export != NULL);
  
-       /* Do not pay attention on 1sec or smaller renewals. */
+       exp = req->rq_export;
         current_time = ktime_get_real_seconds();
-       /* 1 seconds */
-       if (exp->exp_last_request_time + 1 >= current_time + extra_delay)
-               RETURN_EXIT;
  
-       exp->exp_last_request_time = current_time + extra_delay;
+       rc = obd_export_timed_init(exp, &data);
+       if (rc)
+               /* will be updated next time */
+               RETURN_EXIT;
  
         /*
          * exports may get disconnected from the chain even though the
@@ -1130,54 +1254,60 @@ void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay)
          * manipulating the lists
          */
         spin_lock(&exp->exp_obd->obd_dev_lock);
-
-       if (list_empty(&exp->exp_obd_chain_timed)) {
+       if (list_empty(&exp->exp_timed_chain)) {
                 /* this one is not timed */
                 spin_unlock(&exp->exp_obd->obd_dev_lock);
-               RETURN_EXIT;
+               GOTO(err, 0);
         }
  
-       newest_exp = list_last_entry(&exp->exp_obd->obd_exports_timed,
-                                    struct obd_export, exp_obd_chain_timed);
+       exp->exp_last_request_time = current_time;
  
-       list_move_tail(&exp->exp_obd_chain_timed,
-                      &exp->exp_obd->obd_exports_timed);
+       timeout = ptlrpc_export_pinger_timeout(req);
  
-       if (test_bit(OBDF_RECOVERING, exp->exp_obd->obd_flags)) {
-               /* be nice to everyone during recovery */
+       /* Do not pay attention on 1sec or smaller renewals. */
+       if (exp->exp_deadline + 1 >= current_time + timeout) {
                 spin_unlock(&exp->exp_obd->obd_dev_lock);
-               RETURN_EXIT;
+               GOTO(err, 0);
         }
  
-       oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
-                               struct obd_export, exp_obd_chain_timed);
+       newest_exp = obd_export_timed_get(exp->exp_obd, true);
+       obd_export_timed_del(exp);
+       exp->exp_deadline = current_time + timeout;
+       obd_export_timed_add(exp, &data);
  
-       oldest_time = oldest_exp->exp_last_request_time;
+       if (test_bit(OBDF_RECOVERING, exp->exp_obd->obd_flags)) {
+               /* be nice to everyone during recovery */
+               spin_unlock(&exp->exp_obd->obd_dev_lock);
+               GOTO(err, 0);
+       }
+       oldest_exp = obd_export_timed_get(exp->exp_obd, false);
  
         /* Check if the oldest entry is expired. */
-       if (exp->exp_obd->obd_eviction_timer == 0 &&
-           current_time > oldest_time + PING_EVICT_TIMEOUT + extra_delay) {
-
-               if (current_time < newest_exp->exp_last_request_time +
-                            PING_EVICT_TIMEOUT / 2) {
-                       /* If import is active - evict stale clients */
-                       evict = true;
-               } else {
-                       /*
-                        * We need a second timer, in case the net was down and
-                        * it just came back. Since the pinger may skip every
-                        * other PING_INTERVAL (see note in ptlrpc_pinger_main),
-                        * we better wait for 3.
-                        */
-                       exp->exp_obd->obd_eviction_timer =
-                               ktime_get_real_seconds() + 3 * PING_INTERVAL;
-                       CDEBUG(D_HA, "%s: Think about evicting %s from %lld\n",
-                              exp->exp_obd->obd_name,
-                              obd_export_nid2str(oldest_exp), oldest_time);
-
+       if (exp->exp_obd->obd_eviction_timer == 0) {
+               if (current_time > oldest_exp->exp_deadline) {
+                       timeout = newest_exp->exp_last_request_time +
+                               ((newest_exp->exp_deadline -
+                                 newest_exp->exp_last_request_time) >> 1);
+                       if (current_time < timeout) {
+                               /* If import is active - evict stale clients */
+                               evict = true;
+                       } else {
+                               /*
+                                * We need a second timer, in case the net was
+                                * down and it just came back.
+                                */
+                               exp->exp_obd->obd_eviction_timer =
+                                       ktime_get_real_seconds() +
+                                       ptlrpc_export_extra_timeout(oldest_exp);
+                               CDEBUG(D_HA, "%s: Think about evicting %s "
+                                      "from %lld deadline at %lld\n",
+                                      exp->exp_obd->obd_name,
+                                      obd_export_nid2str(oldest_exp),
+                                      oldest_exp->exp_deadline,
+                                      exp->exp_obd->obd_eviction_timer);
+                       }
                 }
         }
-
         spin_unlock(&exp->exp_obd->obd_dev_lock);
  
         if (evict) {
@@ -1185,7 +1315,7 @@ void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay)
                 ping_evictor_wake(exp);
         } else {
                 if (ktime_get_real_seconds() >
-                   (exp->exp_obd->obd_eviction_timer + extra_delay)) {
+                   exp->exp_obd->obd_eviction_timer) {
                         /*
                          * The evictor won't evict anyone who we've heard from
                          * recently, so we don't have to check before we start
@@ -1197,6 +1327,8 @@ void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay)
         }
  
         EXIT;
+err:
+       obd_export_timed_fini(exp, &data);
  }
  
  /**
@@ -2171,7 +2303,8 @@ static int ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
  
                 if (rc)
                         goto err_req;
-               ptlrpc_update_export_timer(req->rq_export, 0);
+
+               ptlrpc_update_export_timer(req);
         }
  
         /* req_in handling should/must be fast */
@@ -2306,9 +2439,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
         if (likely(request->rq_export)) {
                 if (unlikely(ptlrpc_check_req(request)))
                         goto put_conn;
-               ptlrpc_update_export_timer(request->rq_export,
-                                          div_u64(timediff_usecs,
-                                                  USEC_PER_SEC / 2));
+
+               ptlrpc_update_export_timer(request);
         }
  
         /*
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh

index 4c8f61d..e580307 100755 (executable)
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -1134,23 +1134,26 @@ test_26b() {      # bug 10140 - evict dead exports by pinger
                 lctl get_param -n mdt.${mds1_svc}.num_exports)
         local ost_nexp=$(do_facet ost1 \
                 lctl get_param -n obdfilter.${ost1_svc}.num_exports)
+       # must be equal on all the nodes
+       local INTERVAL=$(do_facet $SINGLEMDS lctl get_param -n ping_interval)
+       local AT_MAX_SAVED=$(at_max_get mds1)
+
+       at_max_set $TIMEOUT mds1
+       at_max_set $TIMEOUT ost1
+       stack_trap "at_max_set $AT_MAX_SAVED mds1" EXIT
+       stack_trap "at_max_set $AT_MAX_SAVED ost1" EXIT
  
         echo "starting with '$ost_nexp' OST and '$mds_nexp' MDS exports"
  
         zconf_umount $HOSTNAME $MOUNT2 -f
  
-       # PING_INTERVAL max(obd_timeout / 4, 1U)
-       # PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
-
-       # evictor takes PING_EVICT_TIMEOUT to evict.
-       # But if there's a race to start the evictor from various obds,
-       # the loser might have to wait for the next ping.
-       # = 6 * PING_INTERVAL + PING_INTERVAL
-       # = 7 PING_INTERVAL = 7 obd_timeout / 4 =  (1+3/4)obd_timeout
-       # let's wait $((TIMEOUT * 2)) # bug 19887
-       wait_client_evicted ost1 $ost_nexp $((TIMEOUT * 2)) ||
+       # see ptlrpc_export_timeout() for the pinger case; take a bit more for the test's sake
+       local TOUT=$((INTERVAL * 2 + (TIMEOUT / 20 + 5 + TIMEOUT) * 3))
+       TOUT=$((TOUT + (TOUT >> 3)))
+       echo i $INTERVAL m $AT_MAX_SAVED t $TIMEOUT $TOUT
+       wait_client_evicted ost1 $ost_nexp $TOUT ||
                 error "Client was not evicted by OSS"
-       wait_client_evicted mds1 $mds_nexp $((TIMEOUT * 2)) ||
+       wait_client_evicted mds1 $mds_nexp $TOUT ||
                 error "Client was not evicted by MDS"
  }
  run_test 26b "evict dead exports"
author	Vitaly Fertman <c17818@cray.com>
	Mon, 31 Mar 2025 17:51:28 +0000 (20:51 +0300)
committer	Oleg Drokin <green@whamcloud.com>
	Wed, 16 Apr 2025 20:41:28 +0000 (20:41 +0000)
lustre/include/lustre_dlm.h		patch \| blob \| history
lustre/include/lustre_export.h		patch \| blob \| history
lustre/include/lustre_net.h		patch \| blob \| history
lustre/include/obd.h		patch \| blob \| history
lustre/include/obd_class.h		patch \| blob \| history
lustre/include/obd_support.h		patch \| blob \| history
lustre/ldlm/ldlm_extent.c		patch \| blob \| history
lustre/ldlm/ldlm_lib.c		patch \| blob \| history
lustre/ldlm/ldlm_lockd.c		patch \| blob \| history
lustre/mdt/mdt_handler.c		patch \| blob \| history
lustre/obdclass/class_obd.c		patch \| blob \| history
lustre/obdclass/genops.c		patch \| blob \| history
lustre/obdclass/obd_config.c		patch \| blob \| history
lustre/obdclass/obd_sysfs.c		patch \| blob \| history
lustre/obdecho/echo_client.c		patch \| blob \| history
lustre/ofd/ofd_obd.c		patch \| blob \| history
lustre/ptlrpc/pinger.c		patch \| blob \| history
lustre/ptlrpc/service.c		patch \| blob \| history
lustre/tests/recovery-small.sh		patch \| blob \| history