Whamcloud - gitweb
LU-18878 ptlrpc: improve ping evictor and recovery timeout 20/58620/3
authorVitaly Fertman <c17818@cray.com>
Mon, 31 Mar 2025 17:51:28 +0000 (20:51 +0300)
committerOleg Drokin <green@whamcloud.com>
Wed, 16 Apr 2025 20:41:28 +0000 (20:41 +0000)
Reuse the logic that extends the ldlm prolong timeout for the ping
evictor timeout and for the recovery timer timeout: use the AT
(adaptive timeout) instead of hard-coded values.

As the AT-based ping timeout is extended by a different value each time,
moving an updated export to the end of the timed list can leave the list
unsorted. Replace the list of timed exports with an rbtree.

Partially revert LUS-11054: the tunable ping_evict_timeout_multiplier is
no longer needed.

A minor cleanup of exp_timed flag on connect/reconnect.

Fixes: 8e66f061c ("LU-16002 ptlrpc: adds configurable ping interval")
HPE-bug-id: LUS-11723
Signed-off-by: Vitaly Fertman <vitaly.fertman@hpe.com>
Change-Id: I97b9795746ccd4242f9798c8192abd4d7a57bbac
Reviewed-on: https://es-gerrit.hpc.amslabs.hpecorp.net/162935
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/58620
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
19 files changed:
lustre/include/lustre_dlm.h
lustre/include/lustre_export.h
lustre/include/lustre_net.h
lustre/include/obd.h
lustre/include/obd_class.h
lustre/include/obd_support.h
lustre/ldlm/ldlm_extent.c
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_lockd.c
lustre/mdt/mdt_handler.c
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdclass/obd_config.c
lustre/obdclass/obd_sysfs.c
lustre/obdecho/echo_client.c
lustre/ofd/ofd_obd.c
lustre/ptlrpc/pinger.c
lustre/ptlrpc/service.c
lustre/tests/recovery-small.sh

index a1ec2cf..bdb5485 100644 (file)
@@ -1515,7 +1515,6 @@ int ldlm_request_cancel(struct ptlrpc_request *req,
 
 void ldlm_revoke_export_locks(struct obd_export *exp);
 timeout_t ldlm_bl_timeout(struct ldlm_lock *lock);
-timeout_t ldlm_bl_timeout_by_rpc(struct ptlrpc_request *req);
 #endif
 int ldlm_del_waiting_lock(struct ldlm_lock *lock);
 int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout);
index 844fc4f..afdc282 100644 (file)
@@ -196,7 +196,7 @@ struct obd_export {
         * order
         * protected by obd_dev_lock
         */
-       struct list_head        exp_obd_chain_timed;
+       struct list_head        exp_timed_chain;
        /** Obd device of this export */
        struct obd_device      *exp_obd;
        /**
@@ -223,6 +223,7 @@ struct obd_export {
        __u64                   exp_last_committed;
        /** When was last request received */
        time64_t                exp_last_request_time;
+       time64_t                exp_deadline;
        /** On replay all requests waiting for replay are linked here */
        struct list_head        exp_req_replay_queue;
        /**
@@ -260,7 +261,7 @@ struct obd_export {
                                 */
                                exp_old_falloc:1,
                                exp_hashed:1,
-                               exp_not_timed:1;
+                               exp_timed:1;
        /* also protected by exp_lock */
        enum lustre_sec_part    exp_sp_peer;
        struct sptlrpc_flavor   exp_flvr;               /* current */
index 5049c6b..c608124 100644 (file)
@@ -2261,8 +2261,9 @@ int ptlrpc_service_health_check(struct ptlrpc_service *service);
 void ptlrpc_server_drop_request(struct ptlrpc_request *req);
 void ptlrpc_request_change_export(struct ptlrpc_request *req,
                                  struct obd_export *export);
-void ptlrpc_update_export_timer(struct obd_export *exp,
-                               time64_t extra_delay);
+void ptlrpc_update_export_timer(struct ptlrpc_request *req);
+timeout_t ptlrpc_export_prolong_timeout(struct ptlrpc_request *req,
+                                       bool recovery);
 
 int ptlrpc_hr_init(void);
 void ptlrpc_hr_fini(void);
index 5a82ce2..af7e508 100644 (file)
@@ -683,7 +683,7 @@ struct obd_device {
        struct obd_export       *obd_self_export;
        struct obd_export       *obd_lwp_export;
        /* list of exports in LRU order, for ping evictor, with obd_dev_lock */
-       struct list_head        obd_exports_timed;
+       struct rb_root          obd_exports_timed;
        time64_t                obd_eviction_timer;     /* for ping evictor */
 
        atomic_t                obd_max_recoverable_clients;
index 00b2448..f04f318 100644 (file)
@@ -406,6 +406,12 @@ void class_import_put(struct obd_import *);
 struct obd_import *class_new_import(struct obd_device *obd);
 void class_destroy_import(struct obd_import *exp);
 
+int obd_export_timed_init(struct obd_export *exp, void **data);
+void obd_export_timed_fini(struct obd_export *exp, void **data);
+void obd_export_timed_add(struct obd_export *exp, void **data);
+void obd_export_timed_del(struct obd_export *exp);
+struct obd_export *obd_export_timed_get(struct obd_device *obd, bool last);
+
 #ifdef HAVE_SERVER_SUPPORT
 struct obd_type *class_search_type(const char *name);
 struct obd_type *class_get_type(const char *name);
index 589f052..9e7a4bd 100644 (file)
@@ -41,7 +41,6 @@ extern unsigned int obd_lbug_on_eviction;
 extern unsigned int obd_timeout;          /* seconds */
 extern unsigned int ldlm_timeout;         /* seconds */
 extern unsigned int ping_interval;        /* seconds */
-extern unsigned int ping_evict_timeout_multiplier;
 extern unsigned int obd_timeout_set;
 extern unsigned int ldlm_timeout_set;
 extern unsigned int bulk_timeout;
@@ -90,7 +89,7 @@ extern bool obd_enable_fname_encoding;
  * and there's no urgent need to evict a client just because it's idle, we
  * should be very conservative here.
  */
-#define PING_EVICT_TIMEOUT (PING_INTERVAL * ping_evict_timeout_multiplier)
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
 #define DISK_TIMEOUT 50          /* Beyond this we warn about disk speed */
 #define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
  /* Max connect interval for nonresponsive servers; ~50s to avoid building up
index ae061bd..04825c9 100644 (file)
@@ -770,7 +770,7 @@ void ldlm_lock_prolong_one(struct ldlm_lock *lock,
        /* OK. this is a possible lock the user holds doing I/O
         * let's refresh eviction timer for it.
         */
-       timeout = ldlm_bl_timeout_by_rpc(arg->lpa_req);
+       timeout = ptlrpc_export_prolong_timeout(arg->lpa_req, false);
        LDLM_DEBUG(lock, "refreshed to %ds. ", timeout);
        ldlm_refresh_waiting_lock(lock, timeout);
 }
index c24d892..80f5699 100644 (file)
@@ -980,6 +980,7 @@ int rev_import_init(struct obd_export *export)
 {
        struct obd_device *obd = export->exp_obd;
        struct obd_import *revimp;
+       int rc = 0;
 
        LASSERT(export->exp_imp_reverse == NULL);
 
@@ -997,13 +998,22 @@ int rev_import_init(struct obd_export *export)
        spin_unlock(&export->exp_lock);
        class_import_put(revimp);
 
-       if (!export->exp_not_timed) {
-               spin_lock(&obd->obd_dev_lock);
-               list_add_tail(&export->exp_obd_chain_timed,
-                             &obd->obd_exports_timed);
-               spin_unlock(&obd->obd_dev_lock);
+       if (export->exp_timed) {
+               void *data;
+
+               rc = obd_export_timed_init(export, &data);
+               if (rc == 0) {
+                       spin_lock(&obd->obd_dev_lock);
+                       /* At the beginning, there is no AT stats yet, use
+                        * previous approach for the ping evictor timeout */
+                       export->exp_deadline =
+                               PING_EVICT_TIMEOUT + ktime_get_real_seconds();
+                       obd_export_timed_add(export, &data);
+                       spin_unlock(&obd->obd_dev_lock);
+                       obd_export_timed_fini(export, &data);
+               }
        }
-       return 0;
+       return rc;
 }
 EXPORT_SYMBOL(rev_import_init);
 
@@ -1497,7 +1507,7 @@ dont_check_exports:
                         * should be called to cleanup stuff
                         */
                        spin_lock(&target->obd_dev_lock);
-                       list_del_init(&export->exp_obd_chain_timed);
+                       obd_export_timed_del(export);
                        spin_unlock(&target->obd_dev_lock);
 
                        class_export_get(export);
@@ -2430,29 +2440,8 @@ static void handle_recovery_req(struct ptlrpc_thread *thread,
                 * Add request @timeout to the recovery time so next request from
                 * this client may come in recovery time
                 */
-               if (!obd_at_off(obd)) {
-                       struct ptlrpc_service_part *svcpt;
-                       timeout_t est_timeout;
-
-                       svcpt = req->rq_rqbd->rqbd_svcpt;
-                       /*
-                        * If the server sent early reply for this request,
-                        * the client will recalculate the timeout according to
-                        * current server estimate service time, so we will
-                        * use the maxium timeout here for waiting the client
-                        * sending the next req
-                        */
-                       est_timeout = obd_at_get(obd, &svcpt->scp_at_estimate);
-                       timeout = max_t(timeout_t, at_est2timeout(est_timeout),
-                                       lustre_msg_get_timeout(req->rq_reqmsg));
-                       /*
-                        * Add 2 net_latency, one for balance rq_deadline
-                        * (see ptl_send_rpc), one for resend the req to server,
-                        * Note: client will pack net_latency in replay req
-                        * (see ptlrpc_replay_req)
-                        */
-                       timeout += 2 * lustre_msg_get_service_timeout(req->rq_reqmsg);
-               }
+               if (!obd_at_off(obd))
+                       timeout = ptlrpc_export_prolong_timeout(req, true);
                extend_recovery_timer(class_exp2obd(req->rq_export), timeout,
                                      true);
        }
@@ -2843,7 +2832,7 @@ static int target_recovery_thread(void *arg)
                 * so we need refresh the last_request_time, to avoid the
                 * export is being evicted
                 */
-               ptlrpc_update_export_timer(req->rq_export, 0);
+               ptlrpc_update_export_timer(req);
        }
 
        /*
index 9a292fb..3d3c1c3 100644 (file)
@@ -648,59 +648,6 @@ timeout_t ldlm_bl_timeout(struct ldlm_lock *lock)
 EXPORT_SYMBOL(ldlm_bl_timeout);
 
 /**
- * Calculate the per-export Blocking timeout by the given RPC (covering the
- * reply to this RPC and the next RPC). The next RPC could be still not CANCEL,
- * but having the lock refresh mechanism it is enough.
- *
- * Used for lock refresh timeout when we are in the middle of the process -
- * BL AST is sent, CANCEL is ahead - it is still 1 reply for the current RPC
- * and at least 1 RPC (which will trigger another refresh if it will be not
- * CANCEL) - but more accurate than ldlm_bl_timeout as the timeout is taken
- * from the RPC (i.e. the view of the client on the current AT) is taken into
- * account.
- *
- * \param[in] req     req which export needs the timeout calculation
- *
- * \retval            timeout in seconds to wait for the next client's RPC
- */
-timeout_t ldlm_bl_timeout_by_rpc(struct ptlrpc_request *req)
-{
-       struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
-       timeout_t timeout, req_timeout, at_timeout, netl;
-       struct obd_device *obd = req->rq_export->exp_obd;
-
-       if (obd_at_off(obd))
-               return obd_timeout / 2;
-
-       /* A blocked lock means somebody in the cluster is waiting, and we
-        * should not consider the worst ever case, consisting of a chain of
-        * failures on each step, however this timeout should survive a
-        * recovery of at least 1 failure, let this one to be the worst one:
-        * in case a server NID is dead first re-connect is done through the
-        * same router and also times out.
-        *
-        * Either this on the next RPC times out, take the max.
-        * Considering the current RPC, take just the left time.
-        */
-       netl = obd_at_get(obd,
-                         &req->rq_export->exp_imp_reverse->imp_at.iat_net_latency);
-       req_timeout = req->rq_deadline - ktime_get_real_seconds() + netl;
-       at_timeout = at_est2timeout(obd_at_get(obd, &svcpt->scp_at_estimate))
-                                   + netl;
-       req_timeout = max(req_timeout, at_timeout);
-
-       /* Take 1 re-connect failure and 1 re-connect success into account. */
-       timeout = at_timeout + INITIAL_CONNECT_TIMEOUT + netl + req_timeout;
-
-       /* Client's timeout is calculated as at_est2timeout(), let's be a bit
-        * more conservative than client
-        */
-       return max(timeout + (timeout >> 4),
-                  (timeout_t)obd_get_ldlm_enqueue_min(obd));
-}
-EXPORT_SYMBOL(ldlm_bl_timeout_by_rpc);
-
-/**
  * Perform lock cleanup if AST sending failed.
  */
 static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,
index 1187f06..fa23671 100644 (file)
@@ -7183,14 +7183,16 @@ static int mdt_connect_internal(const struct lu_env *env,
                return -EBADE;
        }
 
-       if (OCD_HAS_FLAG(data, PINGLESS)) {
-               if (ptlrpc_pinger_suppress_pings()) {
-                       spin_lock(&exp->exp_lock);
-                       exp->exp_not_timed = 1;
-                       spin_unlock(&exp->exp_lock);
-               } else {
-                       data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
-               }
+       if (OCD_HAS_FLAG(data, PINGLESS) && !ptlrpc_pinger_suppress_pings())
+               data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+
+       /* Because we do not want this export to be evicted by pinger,
+        * let's not add this export to the timed chain list. */
+       if (!OCD_HAS_FLAG(data, PINGLESS) &&
+           !(data->ocd_connect_flags & OBD_CONNECT_MDS_MDS)) {
+               spin_lock(&exp->exp_lock);
+               exp->exp_timed = 1;
+               spin_unlock(&exp->exp_lock);
        }
 
        data->ocd_max_easize = mdt->mdt_max_ea_size;
@@ -7476,14 +7478,6 @@ out:
                *exp = NULL;
        } else {
                *exp = lexp;
-               /* Because we do not want this export to be evicted by pinger,
-                * let's not add this export to the timed chain list.
-                */
-               if (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) {
-                       spin_lock(&lexp->exp_lock);
-                       lexp->exp_not_timed = 1;
-                       spin_unlock(&lexp->exp_lock);
-               }
        }
 
        RETURN(rc);
@@ -7514,12 +7508,6 @@ static int mdt_obd_reconnect(const struct lu_env *env,
        else
                nodemap_del_member(exp);
 
-       if (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) {
-               spin_lock(&exp->exp_lock);
-               exp->exp_not_timed = 1;
-               spin_unlock(&exp->exp_lock);
-       }
-
        RETURN(rc);
 }
 
index bf6aa51..0a56daf 100644 (file)
@@ -58,8 +58,6 @@ EXPORT_SYMBOL(ldlm_timeout);
 unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ?
                             (OBD_TIMEOUT_DEFAULT / 4) : 1;
 EXPORT_SYMBOL(ping_interval);
-unsigned int ping_evict_timeout_multiplier = 6;
-EXPORT_SYMBOL(ping_evict_timeout_multiplier);
 unsigned int obd_timeout_set;
 EXPORT_SYMBOL(obd_timeout_set);
 unsigned int ldlm_timeout_set;
index e65ef9f..b9f1557 100644 (file)
@@ -378,7 +378,7 @@ struct obd_device *class_newdev(const char *type_name, const char *name,
        newdev->obd_grant_check_threshold = 100;
        INIT_LIST_HEAD(&newdev->obd_unlinked_exports);
        INIT_LIST_HEAD(&newdev->obd_delayed_exports);
-       INIT_LIST_HEAD(&newdev->obd_exports_timed);
+       newdev->obd_exports_timed.rb_node = NULL;
        INIT_LIST_HEAD(&newdev->obd_nid_stats);
        spin_lock_init(&newdev->obd_nid_lock);
        spin_lock_init(&newdev->obd_dev_lock);
@@ -982,7 +982,7 @@ static struct obd_export *__class_new_export(struct obd_device *obd,
        spin_lock_init(&export->exp_bl_list_lock);
        INIT_LIST_HEAD(&export->exp_bl_list);
        INIT_LIST_HEAD(&export->exp_stale_list);
-       INIT_LIST_HEAD(&export->exp_obd_chain_timed);
+       INIT_LIST_HEAD(&export->exp_timed_chain);
        INIT_WORK(&export->exp_zombie_work, obd_zombie_exp_cull);
 
        export->exp_sp_peer = LUSTRE_SP_ANY;
@@ -1040,6 +1040,118 @@ struct obd_export *class_new_export_self(struct obd_device *obd,
        return __class_new_export(obd, uuid, true);
 }
 
+/*
+ * One node of the per-device obd_exports_timed rbtree, keyed by deadline.
+ * Exports whose exp_deadline equals ned_deadline are chained on ned_head.
+ */
+struct rb_node_exp_deadline {
+       struct rb_node    ned_node;     /* linkage into obd_exports_timed */
+       struct list_head  ned_head;     /* exports linked via exp_timed_chain */
+       time64_t          ned_deadline; /* sort key, copied from exp_deadline */
+};
+
+/* rb_add() ordering callback: true when \a ln has an earlier deadline
+ * than \a rn. */
+static inline bool ptlrpc_exp_deadline_less(struct rb_node *ln,
+                                           const struct rb_node *rn)
+{
+       time64_t lhs = rb_entry(ln, struct rb_node_exp_deadline,
+                               ned_node)->ned_deadline;
+       time64_t rhs = rb_entry(rn, struct rb_node_exp_deadline,
+                               ned_node)->ned_deadline;
+
+       return lhs < rhs;
+}
+
+/* rb_find() callback: three-way compare of the time64_t pointed to by
+ * \a key against the deadline stored in \a node (-1, 0 or 1). */
+static inline int ptlrpc_exp_deadline_cmp(const void *key,
+                                         const struct rb_node *node)
+{
+       const time64_t wanted = *(const time64_t *)key;
+       time64_t have;
+
+       have = rb_entry(node, struct rb_node_exp_deadline,
+                       ned_node)->ned_deadline;
+       if (wanted < have)
+               return -1;
+       if (wanted > have)
+               return 1;
+       return 0;
+}
+
+/**
+ * Pre-allocate the rbtree node that obd_export_timed_add() may consume.
+ *
+ * \param[in]  exp     export about to be (re)inserted; not used here
+ * \param[out] data    receives the allocated node
+ *
+ * \retval 0           node allocated; any node left in \a *data afterwards
+ *                     is released with obd_export_timed_fini()
+ * \retval -ENOMEM     allocation failed
+ */
+int obd_export_timed_init(struct obd_export *exp, void **data)
+{
+       OBD_ALLOC(*data, sizeof(struct rb_node_exp_deadline));
+       /* test the allocation result, not the caller's out pointer, which
+        * is never NULL */
+       return *data == NULL ? -ENOMEM : 0;
+}
+EXPORT_SYMBOL(obd_export_timed_init);
+
+/* Free a node from obd_export_timed_init() that obd_export_timed_add() did
+ * not take over; does nothing when \a *data is already NULL. */
+void obd_export_timed_fini(struct obd_export *exp, void **data)
+{
+       if (*data == NULL)
+               return;
+
+       OBD_FREE(*data, sizeof(struct rb_node_exp_deadline));
+       *data = NULL;
+}
+EXPORT_SYMBOL(obd_export_timed_fini);
+
+/*
+ * Link \a exp into its device's deadline tree under key exp->exp_deadline.
+ *
+ * If the tree has no node for that deadline, the pre-allocated node in
+ * \a *data is inserted and \a *data is cleared. Otherwise the existing node
+ * is reused and \a *data is left untouched for the caller to free.
+ * The callers visible in this change hold obd_dev_lock around the call.
+ */
+void obd_export_timed_add(struct obd_export *exp, void **data)
+{
+       struct rb_node_exp_deadline *ned = *data;
+       struct rb_node *node;
+
+       node = rb_find(&exp->exp_deadline, &exp->exp_obd->obd_exports_timed,
+                      ptlrpc_exp_deadline_cmp);
+
+       if (node == NULL) {
+               /* first export with this deadline: consume the spare node */
+               LASSERT(ned != NULL);
+               INIT_LIST_HEAD(&ned->ned_head);
+               RB_CLEAR_NODE(&ned->ned_node);
+               ned->ned_deadline = exp->exp_deadline;
+               /* tell the caller the node has been taken */
+               *data = NULL;
+
+               rb_add(&ned->ned_node, &exp->exp_obd->obd_exports_timed,
+                      ptlrpc_exp_deadline_less);
+       } else {
+               /* share the node already holding this deadline */
+               ned = rb_entry(node, struct rb_node_exp_deadline, ned_node);
+               LASSERT(!list_empty(&ned->ned_head));
+       }
+
+       list_add_tail(&exp->exp_timed_chain, &ned->ned_head);
+}
+EXPORT_SYMBOL(obd_export_timed_add);
+
+/*
+ * Unlink \a exp from its device's deadline tree.
+ *
+ * Does nothing when the export is not linked. When the export was the last
+ * one chained on its deadline node, the node is erased from the tree and
+ * freed. The callers visible in this change hold obd_dev_lock.
+ */
+void obd_export_timed_del(struct obd_export *exp)
+{
+       struct rb_node_exp_deadline *ned;
+       struct rb_node *node;
+
+       if (list_empty(&exp->exp_timed_chain))
+               return;
+
+       node = rb_find(&exp->exp_deadline, &exp->exp_obd->obd_exports_timed,
+                      ptlrpc_exp_deadline_cmp);
+       /* a linked export must have a node for its deadline; assert it
+        * instead of dereferencing a NULL lookup result */
+       LASSERT(node != NULL);
+       ned = rb_entry(node, struct rb_node_exp_deadline, ned_node);
+       LASSERT(!list_empty(&ned->ned_head));
+       LASSERT(ned->ned_deadline == exp->exp_deadline);
+       list_del_init(&exp->exp_timed_chain);
+
+       if (list_empty(&ned->ned_head)) {
+               rb_erase(&ned->ned_node, &exp->exp_obd->obd_exports_timed);
+               OBD_FREE_PTR(ned);
+       }
+}
+EXPORT_SYMBOL(obd_export_timed_del);
+
+/*
+ * Return the first export chained on the earliest-deadline node of \a obd
+ * (or on the latest-deadline node when \a last is true), or NULL when the
+ * tree is empty. No reference is taken on the returned export.
+ */
+struct obd_export *obd_export_timed_get(struct obd_device *obd, bool last)
+{
+       struct rb_node_exp_deadline *ned;
+       struct rb_node *node;
+
+       if (last)
+               node = rb_last(&obd->obd_exports_timed);
+       else
+               node = rb_first(&obd->obd_exports_timed);
+
+       if (!node)
+               return NULL;
+
+       ned = rb_entry(node, struct rb_node_exp_deadline, ned_node);
+       LASSERT(!list_empty(&ned->ned_head));
+
+       return list_first_entry(&ned->ned_head, struct obd_export,
+                               exp_timed_chain);
+}
+EXPORT_SYMBOL(obd_export_timed_get);
+
 void class_unlink_export(struct obd_export *exp)
 {
        class_handle_unhash(&exp->exp_handle);
@@ -1071,7 +1183,7 @@ void class_unlink_export(struct obd_export *exp)
 #endif /* HAVE_SERVER_SUPPORT */
 
        list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports);
-       list_del_init(&exp->exp_obd_chain_timed);
+       obd_export_timed_del(exp);
        exp->exp_obd->obd_num_exports--;
        spin_unlock(&exp->exp_obd->obd_dev_lock);
 
index 9ab855c..9a047d5 100644 (file)
@@ -640,9 +640,6 @@ int class_attach(struct lustre_cfg *lcfg)
        }
 
        obd->obd_self_export = exp;
-       spin_lock(&exp->exp_lock);
-       exp->exp_not_timed = 1;
-       spin_unlock(&exp->exp_lock);
        class_export_put(exp);
 
        rc = class_register_device(obd);
index 1006526..832731b 100644 (file)
@@ -110,7 +110,6 @@ LUSTRE_STATIC_UINT_ATTR(at_unhealthy_factor, &at_unhealthy_factor);
 LUSTRE_STATIC_UINT_ATTR(enable_stats_header, &obd_enable_stats_header);
 LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction);
 LUSTRE_STATIC_UINT_ATTR(ping_interval, &ping_interval);
-LUSTRE_STATIC_UINT_ATTR(evict_multiplier, &ping_evict_timeout_multiplier);
 
 #ifdef HAVE_SERVER_SUPPORT
 LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout);
@@ -516,7 +515,6 @@ static struct attribute *lustre_attrs[] = {
        &lustre_attr_enable_fname_encoding.attr,
        &lustre_sattr_lbug_on_eviction.u.attr,
        &lustre_sattr_ping_interval.u.attr,
-       &lustre_sattr_evict_multiplier.u.attr,
        NULL,
 };
 
index 4760b58..c0710e7 100644 (file)
@@ -2475,13 +2475,6 @@ static int echo_client_setup(const struct lu_env *env,
        ocd->ocd_group = FID_SEQ_ECHO;
 
        rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL);
-       if (rc == 0) {
-               /* Turn off pinger because it connects to tgt obd directly. */
-               spin_lock(&ec->ec_exp->exp_lock);
-               ec->ec_exp->exp_not_timed = 1;
-               spin_unlock(&ec->ec_exp->exp_lock);
-       }
-
        OBD_FREE(ocd, sizeof(*ocd));
 
        if (rc != 0) {
index 3fe0b4f..a321643 100644 (file)
@@ -235,14 +235,13 @@ static int ofd_parse_connect_data(const struct lu_env *env,
 
        data->ocd_version = LUSTRE_VERSION_CODE;
 
-       if (OCD_HAS_FLAG(data, PINGLESS)) {
-               if (ptlrpc_pinger_suppress_pings()) {
-                       spin_lock(&exp->exp_lock);
-                       exp->exp_not_timed = 1;
-                       spin_unlock(&exp->exp_lock);
-               } else {
-                       data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
-               }
+       if (OCD_HAS_FLAG(data, PINGLESS) && !ptlrpc_pinger_suppress_pings())
+               data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+
+       if (!OCD_HAS_FLAG(data, PINGLESS)) {
+               spin_lock(&exp->exp_lock);
+               exp->exp_timed = 1;
+               spin_unlock(&exp->exp_lock);
        }
 
        if (!ofd->ofd_lut.lut_dt_conf.ddp_has_lseek_data_hole)
index a04b889..c758d58 100644 (file)
@@ -445,7 +445,7 @@ static int ping_evictor_main(void *arg)
 {
        struct obd_device *obd;
        struct obd_export *exp;
-       time64_t expire_time;
+       time64_t current_time;
        struct lu_env env;
        int rc;
 
@@ -485,10 +485,9 @@ static int ping_evictor_main(void *arg)
                        CFS_FAIL_TIMEOUT(OBD_FAIL_OBD_PAUSE_EVICTOR,
                                         PING_INTERVAL + PING_EVICT_TIMEOUT);
 
-               expire_time = ktime_get_real_seconds() - PING_EVICT_TIMEOUT;
+               current_time = ktime_get_real_seconds();
 
-               CDEBUG(D_HA, "evicting all exports of obd %s older than %lld\n",
-                      obd->obd_name, expire_time);
+               CDEBUG(D_HA, "evicting all exports of obd %s\n", obd->obd_name);
 
                /*
                 * Exports can't be deleted out of the list while we hold
@@ -497,24 +496,21 @@ static int ping_evictor_main(void *arg)
                 * removed from the list, we won't find them here.
                 */
                spin_lock(&obd->obd_dev_lock);
-               while (!list_empty(&obd->obd_exports_timed)) {
-                       exp = list_first_entry(&obd->obd_exports_timed,
-                                              struct obd_export,
-                                              exp_obd_chain_timed);
-                       if (expire_time > exp->exp_last_request_time) {
+               while((exp = obd_export_timed_get(obd, false))) {
+                       if (current_time > exp->exp_deadline) {
                                struct obd_uuid *client_uuid;
 
                                class_export_get(exp);
                                client_uuid = &exp->exp_client_uuid;
                                spin_unlock(&obd->obd_dev_lock);
-                               LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %lld seconds. I think it's dead, and I am evicting it. exp %p, cur %lld expire %lld last %lld\n",
+                               LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %lld seconds. I think it's dead, and I am evicting it. exp %p, cur %lld deadline %lld last %lld\n",
                                              obd->obd_name,
                                              obd_uuid2str(client_uuid),
                                              obd_export_nid2str(exp),
                                              ktime_get_real_seconds() -
                                              exp->exp_last_request_time,
-                                             exp, ktime_get_real_seconds(),
-                                             expire_time,
+                                             exp, current_time,
+                                             exp->exp_deadline,
                                              exp->exp_last_request_time);
                                CDEBUG(D_HA, "Last request was at %lld\n",
                                       exp->exp_last_request_time);
index db55471..89bae0d 100644 (file)
@@ -1095,34 +1095,158 @@ static void ptlrpc_server_finish_active_request(
 }
 
 /**
+ * Calculate an export eviction timeout.
+ * Used for both cases, lock prolong timeout and ping evictor timeout.
+ *
+ * Whereas a problem client may be still alive trying hard to reconnect and to
+ * resend its RPCs, we should not consider the worst ever case, consisting of
+ * a chain of failures on each step. Let this timeout survive a recovery of
+ * just 1 failure, but let this be the worst possible one - a dead server NID:
+ *
+ * - an RPC timeout;
+ * - the first re-connect is sent to the same NID and times out;
+ * - the second re-connect to the failover pair returns an error;
+ * - the third re-connect to the original node to a different NID succeeds;
+ * - the RPC resend succeeds;
+ *
+ * For lock prolong timeout, we are in the middle of the process -
+ * BL AST is sent, CANCEL is ahead - it is still 1 reply for the current RPC
+ * and at least 1 other RPC (which will trigger another refresh if it is
+ * not CANCEL) - but more accurate than ldlm_bl_timeout, as the timeout taken
+ * from the RPC (i.e. the client's view of the current AT) is taken into
+ * account.
+ *
+ * \param[in] obd            device passed to obd_at_off(), obd_at_get() and
+ *                           obd_get_ldlm_enqueue_min()
+ * \param[in] at             AT of RPC service time to calculate timeout for;
+ *                           only dereferenced when \a pinger is false
+ * \param[in] netl           network latency estimate, added per hop below
+ * \param[in] rpc_left_time   left service time for the current RPC,
+ *                           0 if not applicable
+ * \param[in] pinger         true if the caller is the ping evictor, false
+ *                           for the lock prolong / recovery callers
+ *
+ * \retval             timeout in seconds to wait for the next client's RPC;
+ *                     obd_timeout / 2 when AT is off for \a obd
+ */
+static timeout_t ptlrpc_export_timeout(struct obd_device *obd,
+                                      struct adaptive_timeout *at,
+                                      timeout_t netl,
+                                      timeout_t rpc_left_time,
+                                      bool pinger)
+{
+       timeout_t timeout, at_timeout, req_timeout;
+
+       if (obd_at_off(obd))
+               return obd_timeout / 2;
+
+       if (pinger) {
+               /* There might be a delay till the next RPC. In fact it is two
+                * PING_INTERVALs due to ptlrpc_pinger_main logic. */
+               timeout = 2 * PING_INTERVAL;
+       } else {
+               /* For the lock prolong, we have an RPC in hand, which may still
+                * get its reply lost. Therefore, it may be either this one or
+                * the next client's RPC times out, take the max.
+                * Considering the current RPC, take just the left time. */
+               LASSERT(at != NULL);
+               at_timeout = at_est2timeout(obd_at_get(obd, at)) + netl;
+               req_timeout = max(rpc_left_time + netl, at_timeout);
+               /* Adding the RPC resend time - not needed in the ping evictor
+                * case, export is updated on re-connect  */
+               timeout = req_timeout + at_timeout;
+       }
+
+       /* Adding the re-connect time: 1st re-connect timeout,
+        * 2nd reconnect error, 3rd reconnect success. */
+       timeout += 3 * (INITIAL_CONNECT_TIMEOUT + netl);
+
+       /* Let's be a bit more conservative than client: add 1/16 on top and
+        * never go below obd_get_ldlm_enqueue_min() */
+       return max(timeout + (timeout >> 4),
+                  (timeout_t)obd_get_ldlm_enqueue_min(obd));
+}
+
+/**
+ * Used for lock prolong timeout, calculates a timeout for CANCEL to come.
+ * Also used for recovery, calculates a timeout for a next recovery RPC to come.
+ * In this case, there is an RPC in hand. Thus, a particular svcpt AT is used.
+ *
+ * When \a recovery is false, the reverse import network AT is used as an
+ * estimate for the client side one. When \a recovery is true, the value is
+ * read from the request message via lustre_msg_get_service_timeout().
+ *
+ * \param[in] req      request in hand; its export, service partition AT and
+ *                     rq_deadline are used
+ * \param[in] recovery selects the network latency source as described above
+ *
+ * \retval             result of ptlrpc_export_timeout() for the non-pinger
+ *                     case, with the time left until rq_deadline
+ */
+timeout_t ptlrpc_export_prolong_timeout(struct ptlrpc_request *req,
+                                       bool recovery)
+{
+       timeout_t netl;
+
+       if (recovery)
+               netl = lustre_msg_get_service_timeout(req->rq_reqmsg);
+       else
+               netl = obd_at_get(req->rq_export->exp_obd,
+                                 &req->rq_export->exp_imp_reverse->
+                                 imp_at.iat_net_latency);
+
+       return ptlrpc_export_timeout(req->rq_export->exp_obd,
+                                    &req->rq_rqbd->rqbd_svcpt->scp_at_estimate,
+                                    netl, req->rq_deadline -
+                                    ktime_get_real_seconds(), false);
+}
+
+/**
+ * Ping evictor variant: how long to wait for any next RPC from the client
+ * that sent \a req.
+ * AT statistics are kept separately per portal, so only the AT of the
+ * service partition that received this last RPC is consulted.
+ *
+ * The reverse import network AT stands in for the client side latency.
+ */
+static timeout_t ptlrpc_export_pinger_timeout(struct ptlrpc_request *req)
+{
+       struct obd_export *exp = req->rq_export;
+       struct obd_device *obd = exp->exp_obd;
+       timeout_t netl;
+
+       netl = obd_at_get(obd, &exp->exp_imp_reverse->imp_at.iat_net_latency);
+
+       return ptlrpc_export_timeout(obd,
+                                    &req->rq_rqbd->rqbd_svcpt->scp_at_estimate,
+                                    netl, 0, true);
+}
+
+/**
+ * Extra allowance for the case where the network was down and has just come
+ * back after the first timeout already expired, so clients keep sending
+ * re-connects. The formula of ptlrpc_export_timeout() applied to this case:
+ * - a previous reconnect to the not yet recovered network times out;
+ * - the second reconnect to the failover pair gets ENODEV;
+ * - the third reconnect succeeds;
+ */
+static timeout_t ptlrpc_export_extra_timeout(struct obd_export *exp)
+{
+       struct obd_import *revimp = exp->exp_imp_reverse;
+       timeout_t netl = obd_at_get(exp->exp_obd,
+                                   &revimp->imp_at.iat_net_latency);
+
+       /* This is not the 1st re-connection failure, so the client may have
+        * had its net latency extended to the max - CONNECTION_SWITCH_MAX */
+       return 3 * INITIAL_CONNECT_TIMEOUT + CONNECTION_SWITCH_MAX + 2 * netl;
+}
+
+/**
  * This function makes sure dead exports are evicted in a timely manner.
  * This function is only called when some export receives a message (i.e.,
  * the network is up.)
  */
-void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay)
+void ptlrpc_update_export_timer(struct ptlrpc_request *req)
 {
-       struct obd_export *oldest_exp, *newest_exp;
-       time64_t oldest_time, current_time;
-       bool    evict = false;
+       struct obd_export *oldest_exp, *newest_exp, *exp;
+       time64_t current_time, timeout;
+       bool evict = false;
+       void *data;
+       int rc;
        ENTRY;
 
-       LASSERT(exp);
-
-       /*
-        * Compensate for slow machines, etc, by faking our request time
-        * into the future.  Although this can break the strict time-ordering
-        * of the list, we can be really lazy here - we don't have to evict
-        * at the exact right moment.  Eventually, all silent exports
-        * will make it to the top of the list.
-        */
+       LASSERT(req != NULL);
+       LASSERT(req->rq_export != NULL);
 
-       /* Do not pay attention on 1sec or smaller renewals. */
+       exp = req->rq_export;
        current_time = ktime_get_real_seconds();
-       /* 1 seconds */
-       if (exp->exp_last_request_time + 1 >= current_time + extra_delay)
-               RETURN_EXIT;
 
-       exp->exp_last_request_time = current_time + extra_delay;
+       rc = obd_export_timed_init(exp, &data);
+       if (rc)
+               /* will be updated next time */
+               RETURN_EXIT;
 
        /*
         * exports may get disconnected from the chain even though the
@@ -1130,54 +1254,60 @@ void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay)
         * manipulating the lists
         */
        spin_lock(&exp->exp_obd->obd_dev_lock);
-
-       if (list_empty(&exp->exp_obd_chain_timed)) {
+       if (list_empty(&exp->exp_timed_chain)) {
                /* this one is not timed */
                spin_unlock(&exp->exp_obd->obd_dev_lock);
-               RETURN_EXIT;
+               GOTO(err, 0);
        }
 
-       newest_exp = list_last_entry(&exp->exp_obd->obd_exports_timed,
-                                    struct obd_export, exp_obd_chain_timed);
+       exp->exp_last_request_time = current_time;
 
-       list_move_tail(&exp->exp_obd_chain_timed,
-                      &exp->exp_obd->obd_exports_timed);
+       timeout = ptlrpc_export_pinger_timeout(req);
 
-       if (test_bit(OBDF_RECOVERING, exp->exp_obd->obd_flags)) {
-               /* be nice to everyone during recovery */
+       /* Ignore renewals that extend the deadline by 1 second or less. */
+       if (exp->exp_deadline + 1 >= current_time + timeout) {
                spin_unlock(&exp->exp_obd->obd_dev_lock);
-               RETURN_EXIT;
+               GOTO(err, 0);
        }
 
-       oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
-                               struct obd_export, exp_obd_chain_timed);
+       newest_exp = obd_export_timed_get(exp->exp_obd, true);
+       obd_export_timed_del(exp);
+       exp->exp_deadline = current_time + timeout;
+       obd_export_timed_add(exp, &data);
 
-       oldest_time = oldest_exp->exp_last_request_time;
+       if (test_bit(OBDF_RECOVERING, exp->exp_obd->obd_flags)) {
+               /* be nice to everyone during recovery */
+               spin_unlock(&exp->exp_obd->obd_dev_lock);
+               GOTO(err, 0);
+       }
+       oldest_exp = obd_export_timed_get(exp->exp_obd, false);
 
        /* Check if the oldest entry is expired. */
-       if (exp->exp_obd->obd_eviction_timer == 0 &&
-           current_time > oldest_time + PING_EVICT_TIMEOUT + extra_delay) {
-
-               if (current_time < newest_exp->exp_last_request_time +
-                            PING_EVICT_TIMEOUT / 2) {
-                       /* If import is active - evict stale clients */
-                       evict = true;
-               } else {
-                       /*
-                        * We need a second timer, in case the net was down and
-                        * it just came back. Since the pinger may skip every
-                        * other PING_INTERVAL (see note in ptlrpc_pinger_main),
-                        * we better wait for 3.
-                        */
-                       exp->exp_obd->obd_eviction_timer =
-                               ktime_get_real_seconds() + 3 * PING_INTERVAL;
-                       CDEBUG(D_HA, "%s: Think about evicting %s from %lld\n",
-                              exp->exp_obd->obd_name,
-                              obd_export_nid2str(oldest_exp), oldest_time);
-
+       if (exp->exp_obd->obd_eviction_timer == 0) {
+               if (current_time > oldest_exp->exp_deadline) {
+                       timeout = newest_exp->exp_last_request_time +
+                               ((newest_exp->exp_deadline -
+                                 newest_exp->exp_last_request_time) >> 1);
+                       if (current_time < timeout) {
+                               /* If import is active - evict stale clients */
+                               evict = true;
+                       } else {
+                               /*
+                                * We need a second timer, in case the net was
+                                * down and it just came back.
+                                */
+                               exp->exp_obd->obd_eviction_timer =
+                                       ktime_get_real_seconds() +
+                                       ptlrpc_export_extra_timeout(oldest_exp);
+                               CDEBUG(D_HA, "%s: Think about evicting %s "
+                                      "from %lld deadline at %lld\n",
+                                      exp->exp_obd->obd_name,
+                                      obd_export_nid2str(oldest_exp),
+                                      oldest_exp->exp_deadline,
+                                      exp->exp_obd->obd_eviction_timer);
+                       }
                }
        }
-
        spin_unlock(&exp->exp_obd->obd_dev_lock);
 
        if (evict) {
@@ -1185,7 +1315,7 @@ void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay)
                ping_evictor_wake(exp);
        } else {
                if (ktime_get_real_seconds() >
-                   (exp->exp_obd->obd_eviction_timer + extra_delay)) {
+                   exp->exp_obd->obd_eviction_timer) {
                        /*
                         * The evictor won't evict anyone who we've heard from
                         * recently, so we don't have to check before we start
@@ -1197,6 +1327,8 @@ void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay)
        }
 
        EXIT;
+err:
+       obd_export_timed_fini(exp, &data);
 }
 
 /**
@@ -2171,7 +2303,8 @@ static int ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
 
                if (rc)
                        goto err_req;
-               ptlrpc_update_export_timer(req->rq_export, 0);
+
+               ptlrpc_update_export_timer(req);
        }
 
        /* req_in handling should/must be fast */
@@ -2306,9 +2439,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
        if (likely(request->rq_export)) {
                if (unlikely(ptlrpc_check_req(request)))
                        goto put_conn;
-               ptlrpc_update_export_timer(request->rq_export,
-                                          div_u64(timediff_usecs,
-                                                  USEC_PER_SEC / 2));
+
+               ptlrpc_update_export_timer(request);
        }
 
        /*
index 4c8f61d..e580307 100755 (executable)
@@ -1134,23 +1134,26 @@ test_26b() {      # bug 10140 - evict dead exports by pinger
                lctl get_param -n mdt.${mds1_svc}.num_exports)
        local ost_nexp=$(do_facet ost1 \
                lctl get_param -n obdfilter.${ost1_svc}.num_exports)
+       # must be equal on all the nodes
+       local INTERVAL=$(do_facet $SINGLEMDS lctl get_param -n ping_interval)
+       local AT_MAX_SAVED=$(at_max_get mds1)
+
+       at_max_set $TIMEOUT mds1
+       at_max_set $TIMEOUT ost1
+       stack_trap "at_max_set $AT_MAX_SAVED mds1" EXIT
+       stack_trap "at_max_set $AT_MAX_SAVED ost1" EXIT
 
        echo "starting with '$ost_nexp' OST and '$mds_nexp' MDS exports"
 
        zconf_umount $HOSTNAME $MOUNT2 -f
 
-       # PING_INTERVAL max(obd_timeout / 4, 1U)
-       # PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
-
-       # evictor takes PING_EVICT_TIMEOUT to evict.
-       # But if there's a race to start the evictor from various obds,
-       # the loser might have to wait for the next ping.
-       # = 6 * PING_INTERVAL + PING_INTERVAL
-       # = 7 PING_INTERVAL = 7 obd_timeout / 4 =  (1+3/4)obd_timeout
-       # let's wait $((TIMEOUT * 2)) # bug 19887
-       wait_client_evicted ost1 $ost_nexp $((TIMEOUT * 2)) ||
+       # see ptlrpc_export_timeout() for the pinger case; take a bit more for the test's sake
+       local TOUT=$((INTERVAL * 2 + (TIMEOUT / 20 + 5 + TIMEOUT) * 3))
+       TOUT=$((TOUT + (TOUT >> 3)))
+       echo i $INTERVAL m $AT_MAX_SAVED t $TIMEOUT $TOUT
+       wait_client_evicted ost1 $ost_nexp $TOUT ||
                error "Client was not evicted by OSS"
-       wait_client_evicted mds1 $mds_nexp $((TIMEOUT * 2)) ||
+       wait_client_evicted mds1 $mds_nexp $TOUT ||
                error "Client was not evicted by MDS"
 }
 run_test 26b "evict dead exports"