void ldlm_revoke_export_locks(struct obd_export *exp);
timeout_t ldlm_bl_timeout(struct ldlm_lock *lock);
-timeout_t ldlm_bl_timeout_by_rpc(struct ptlrpc_request *req);
#endif
int ldlm_del_waiting_lock(struct ldlm_lock *lock);
int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout);
* order
* protected by obd_dev_lock
*/
- struct list_head exp_obd_chain_timed;
+ struct list_head exp_timed_chain;
/** Obd device of this export */
struct obd_device *exp_obd;
/**
__u64 exp_last_committed;
/** When was last request received */
time64_t exp_last_request_time;
+ time64_t exp_deadline;
/** On replay all requests waiting for replay are linked here */
struct list_head exp_req_replay_queue;
/**
*/
exp_old_falloc:1,
exp_hashed:1,
- exp_not_timed:1;
+ exp_timed:1;
/* also protected by exp_lock */
enum lustre_sec_part exp_sp_peer;
struct sptlrpc_flavor exp_flvr; /* current */
void ptlrpc_server_drop_request(struct ptlrpc_request *req);
void ptlrpc_request_change_export(struct ptlrpc_request *req,
struct obd_export *export);
-void ptlrpc_update_export_timer(struct obd_export *exp,
- time64_t extra_delay);
+void ptlrpc_update_export_timer(struct ptlrpc_request *req);
+timeout_t ptlrpc_export_prolong_timeout(struct ptlrpc_request *req,
+ bool recovery);
int ptlrpc_hr_init(void);
void ptlrpc_hr_fini(void);
struct obd_export *obd_self_export;
struct obd_export *obd_lwp_export;
/* list of exports in LRU order, for ping evictor, with obd_dev_lock */
- struct list_head obd_exports_timed;
+ struct rb_root obd_exports_timed;
time64_t obd_eviction_timer; /* for ping evictor */
atomic_t obd_max_recoverable_clients;
struct obd_import *class_new_import(struct obd_device *obd);
void class_destroy_import(struct obd_import *exp);
+int obd_export_timed_init(struct obd_export *exp, void **data);
+void obd_export_timed_fini(struct obd_export *exp, void **data);
+void obd_export_timed_add(struct obd_export *exp, void **data);
+void obd_export_timed_del(struct obd_export *exp);
+struct obd_export *obd_export_timed_get(struct obd_device *obd, bool last);
+
#ifdef HAVE_SERVER_SUPPORT
struct obd_type *class_search_type(const char *name);
struct obd_type *class_get_type(const char *name);
extern unsigned int obd_timeout; /* seconds */
extern unsigned int ldlm_timeout; /* seconds */
extern unsigned int ping_interval; /* seconds */
-extern unsigned int ping_evict_timeout_multiplier;
extern unsigned int obd_timeout_set;
extern unsigned int ldlm_timeout_set;
extern unsigned int bulk_timeout;
* and there's no urgent need to evict a client just because it's idle, we
* should be very conservative here.
*/
-#define PING_EVICT_TIMEOUT (PING_INTERVAL * ping_evict_timeout_multiplier)
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */
#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
/* Max connect interval for nonresponsive servers; ~50s to avoid building up
/* OK. this is a possible lock the user holds doing I/O
* let's refresh eviction timer for it.
*/
- timeout = ldlm_bl_timeout_by_rpc(arg->lpa_req);
+ timeout = ptlrpc_export_prolong_timeout(arg->lpa_req, false);
LDLM_DEBUG(lock, "refreshed to %ds. ", timeout);
ldlm_refresh_waiting_lock(lock, timeout);
}
{
struct obd_device *obd = export->exp_obd;
struct obd_import *revimp;
+ int rc = 0;
LASSERT(export->exp_imp_reverse == NULL);
spin_unlock(&export->exp_lock);
class_import_put(revimp);
- if (!export->exp_not_timed) {
- spin_lock(&obd->obd_dev_lock);
- list_add_tail(&export->exp_obd_chain_timed,
- &obd->obd_exports_timed);
- spin_unlock(&obd->obd_dev_lock);
+ if (export->exp_timed) {
+ void *data;
+
+ rc = obd_export_timed_init(export, &data);
+ if (rc == 0) {
+ spin_lock(&obd->obd_dev_lock);
+ /* At the beginning, there is no AT stats yet, use
+ * previous approach for the ping evictor timeout */
+ export->exp_deadline =
+ PING_EVICT_TIMEOUT + ktime_get_real_seconds();
+ obd_export_timed_add(export, &data);
+ spin_unlock(&obd->obd_dev_lock);
+ obd_export_timed_fini(export, &data);
+ }
}
- return 0;
+ return rc;
}
EXPORT_SYMBOL(rev_import_init);
* should be called to cleanup stuff
*/
spin_lock(&target->obd_dev_lock);
- list_del_init(&export->exp_obd_chain_timed);
+ obd_export_timed_del(export);
spin_unlock(&target->obd_dev_lock);
class_export_get(export);
* Add request @timeout to the recovery time so next request from
* this client may come in recovery time
*/
- if (!obd_at_off(obd)) {
- struct ptlrpc_service_part *svcpt;
- timeout_t est_timeout;
-
- svcpt = req->rq_rqbd->rqbd_svcpt;
- /*
- * If the server sent early reply for this request,
- * the client will recalculate the timeout according to
- * current server estimate service time, so we will
- * use the maxium timeout here for waiting the client
- * sending the next req
- */
- est_timeout = obd_at_get(obd, &svcpt->scp_at_estimate);
- timeout = max_t(timeout_t, at_est2timeout(est_timeout),
- lustre_msg_get_timeout(req->rq_reqmsg));
- /*
- * Add 2 net_latency, one for balance rq_deadline
- * (see ptl_send_rpc), one for resend the req to server,
- * Note: client will pack net_latency in replay req
- * (see ptlrpc_replay_req)
- */
- timeout += 2 * lustre_msg_get_service_timeout(req->rq_reqmsg);
- }
+ if (!obd_at_off(obd))
+ timeout = ptlrpc_export_prolong_timeout(req, true);
extend_recovery_timer(class_exp2obd(req->rq_export), timeout,
true);
}
* so we need refresh the last_request_time, to avoid the
* export is being evicted
*/
- ptlrpc_update_export_timer(req->rq_export, 0);
+ ptlrpc_update_export_timer(req);
}
/*
EXPORT_SYMBOL(ldlm_bl_timeout);
/**
- * Calculate the per-export Blocking timeout by the given RPC (covering the
- * reply to this RPC and the next RPC). The next RPC could be still not CANCEL,
- * but having the lock refresh mechanism it is enough.
- *
- * Used for lock refresh timeout when we are in the middle of the process -
- * BL AST is sent, CANCEL is ahead - it is still 1 reply for the current RPC
- * and at least 1 RPC (which will trigger another refresh if it will be not
- * CANCEL) - but more accurate than ldlm_bl_timeout as the timeout is taken
- * from the RPC (i.e. the view of the client on the current AT) is taken into
- * account.
- *
- * \param[in] req req which export needs the timeout calculation
- *
- * \retval timeout in seconds to wait for the next client's RPC
- */
-timeout_t ldlm_bl_timeout_by_rpc(struct ptlrpc_request *req)
-{
- struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
- timeout_t timeout, req_timeout, at_timeout, netl;
- struct obd_device *obd = req->rq_export->exp_obd;
-
- if (obd_at_off(obd))
- return obd_timeout / 2;
-
- /* A blocked lock means somebody in the cluster is waiting, and we
- * should not consider the worst ever case, consisting of a chain of
- * failures on each step, however this timeout should survive a
- * recovery of at least 1 failure, let this one to be the worst one:
- * in case a server NID is dead first re-connect is done through the
- * same router and also times out.
- *
- * Either this on the next RPC times out, take the max.
- * Considering the current RPC, take just the left time.
- */
- netl = obd_at_get(obd,
- &req->rq_export->exp_imp_reverse->imp_at.iat_net_latency);
- req_timeout = req->rq_deadline - ktime_get_real_seconds() + netl;
- at_timeout = at_est2timeout(obd_at_get(obd, &svcpt->scp_at_estimate))
- + netl;
- req_timeout = max(req_timeout, at_timeout);
-
- /* Take 1 re-connect failure and 1 re-connect success into account. */
- timeout = at_timeout + INITIAL_CONNECT_TIMEOUT + netl + req_timeout;
-
- /* Client's timeout is calculated as at_est2timeout(), let's be a bit
- * more conservative than client
- */
- return max(timeout + (timeout >> 4),
- (timeout_t)obd_get_ldlm_enqueue_min(obd));
-}
-EXPORT_SYMBOL(ldlm_bl_timeout_by_rpc);
-
-/**
* Perform lock cleanup if AST sending failed.
*/
static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,
return -EBADE;
}
- if (OCD_HAS_FLAG(data, PINGLESS)) {
- if (ptlrpc_pinger_suppress_pings()) {
- spin_lock(&exp->exp_lock);
- exp->exp_not_timed = 1;
- spin_unlock(&exp->exp_lock);
- } else {
- data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
- }
+ if (OCD_HAS_FLAG(data, PINGLESS) && !ptlrpc_pinger_suppress_pings())
+ data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+
+ /* PINGLESS clients and MDS-MDS connections are not pinged, so they are
+ * kept off the timed chain; only regular clients get exp_timed set. */
+ if (!OCD_HAS_FLAG(data, PINGLESS) &&
+ !(data->ocd_connect_flags & OBD_CONNECT_MDS_MDS)) {
+ spin_lock(&exp->exp_lock);
+ exp->exp_timed = 1;
+ spin_unlock(&exp->exp_lock);
}
data->ocd_max_easize = mdt->mdt_max_ea_size;
*exp = NULL;
} else {
*exp = lexp;
- /* Because we do not want this export to be evicted by pinger,
- * let's not add this export to the timed chain list.
- */
- if (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) {
- spin_lock(&lexp->exp_lock);
- lexp->exp_not_timed = 1;
- spin_unlock(&lexp->exp_lock);
- }
}
RETURN(rc);
else
nodemap_del_member(exp);
- if (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) {
- spin_lock(&exp->exp_lock);
- exp->exp_not_timed = 1;
- spin_unlock(&exp->exp_lock);
- }
-
RETURN(rc);
}
unsigned int ping_interval = (OBD_TIMEOUT_DEFAULT > 4) ?
(OBD_TIMEOUT_DEFAULT / 4) : 1;
EXPORT_SYMBOL(ping_interval);
-unsigned int ping_evict_timeout_multiplier = 6;
-EXPORT_SYMBOL(ping_evict_timeout_multiplier);
unsigned int obd_timeout_set;
EXPORT_SYMBOL(obd_timeout_set);
unsigned int ldlm_timeout_set;
newdev->obd_grant_check_threshold = 100;
INIT_LIST_HEAD(&newdev->obd_unlinked_exports);
INIT_LIST_HEAD(&newdev->obd_delayed_exports);
- INIT_LIST_HEAD(&newdev->obd_exports_timed);
+ newdev->obd_exports_timed.rb_node = NULL;
INIT_LIST_HEAD(&newdev->obd_nid_stats);
spin_lock_init(&newdev->obd_nid_lock);
spin_lock_init(&newdev->obd_dev_lock);
spin_lock_init(&export->exp_bl_list_lock);
INIT_LIST_HEAD(&export->exp_bl_list);
INIT_LIST_HEAD(&export->exp_stale_list);
- INIT_LIST_HEAD(&export->exp_obd_chain_timed);
+ INIT_LIST_HEAD(&export->exp_timed_chain);
INIT_WORK(&export->exp_zombie_work, obd_zombie_exp_cull);
export->exp_sp_peer = LUSTRE_SP_ANY;
return __class_new_export(obd, uuid, true);
}
+/* Ping-evictor deadline bucket: every export whose eviction deadline falls on
+ * the same second is chained on ned_head, and the buckets are kept in the
+ * per-obd rbtree obd_exports_timed keyed by ned_deadline. */
+struct rb_node_exp_deadline {
+ struct rb_node ned_node;
+ struct list_head ned_head;
+ time64_t ned_deadline;
+};
+
+/* rb_add() comparator: order deadline buckets by ascending ned_deadline,
+ * so rb_first() is the soonest-to-expire bucket. */
+static inline bool ptlrpc_exp_deadline_less(struct rb_node *ln,
+ const struct rb_node *rn)
+{
+ struct rb_node_exp_deadline *left, *right;
+
+ left = rb_entry(ln, struct rb_node_exp_deadline, ned_node);
+ right = rb_entry(rn, struct rb_node_exp_deadline, ned_node);
+
+ return left->ned_deadline < right->ned_deadline;
+}
+
+/* rb_find() comparator: three-way compare of the time64_t key against a
+ * bucket's deadline; returns <0/0/>0 as *key sorts before/equal/after. */
+static inline int ptlrpc_exp_deadline_cmp(const void *key,
+ const struct rb_node *node)
+{
+ const struct rb_node_exp_deadline *ned;
+ /* keep const-correctness: no need to cast const away from 'key' */
+ const time64_t *time = key;
+
+ ned = rb_entry(node, struct rb_node_exp_deadline, ned_node);
+ return (*time < ned->ned_deadline ? -1 :
+ *time > ned->ned_deadline ? 1 : 0);
+}
+
+/**
+ * Preallocate a deadline bucket for a later obd_export_timed_add() call,
+ * outside of obd_dev_lock. On success *data holds the bucket; the caller
+ * must release an unconsumed bucket with obd_export_timed_fini().
+ *
+ * \retval 0 on success, -ENOMEM if the allocation failed
+ */
+int obd_export_timed_init(struct obd_export *exp, void **data)
+{
+ OBD_ALLOC(*data, sizeof(struct rb_node_exp_deadline));
+ /* Check the allocation result (*data), not the address of the
+ * caller's pointer ('data'), which can never be NULL - the old
+ * check silently hid allocation failures. */
+ return *data == NULL ? -ENOMEM : 0;
+}
+EXPORT_SYMBOL(obd_export_timed_init);
+
+/**
+ * Release a preallocated deadline bucket that obd_export_timed_add() did not
+ * consume. Safe to call when *data is already NULL; resets *data afterwards.
+ */
+void obd_export_timed_fini(struct obd_export *exp, void **data)
+{
+ if (*data) {
+ OBD_FREE(*data, sizeof(struct rb_node_exp_deadline));
+ *data = NULL;
+ }
+}
+EXPORT_SYMBOL(obd_export_timed_fini);
+
+/**
+ * Insert @exp into its obd's eviction-deadline tree, keyed by
+ * exp->exp_deadline. Caller holds obd_dev_lock (all tree users take it).
+ *
+ * Ownership of the preallocated bucket in *data: consumed (and *data set to
+ * NULL) only when a new bucket is created for this deadline; if a bucket for
+ * the same deadline already exists, *data is left for the caller to free via
+ * obd_export_timed_fini().
+ */
+void obd_export_timed_add(struct obd_export *exp, void **data)
+{
+ struct rb_node_exp_deadline *ned = *data;
+ struct rb_node *node;
+
+ node = rb_find(&exp->exp_deadline, &exp->exp_obd->obd_exports_timed,
+ ptlrpc_exp_deadline_cmp);
+
+ if (node == NULL) {
+ /* No bucket for this deadline yet: consume the preallocation */
+ LASSERT(ned != NULL);
+ INIT_LIST_HEAD(&ned->ned_head);
+ RB_CLEAR_NODE(&ned->ned_node);
+ ned->ned_deadline = exp->exp_deadline;
+ *data = NULL;
+
+ rb_add(&ned->ned_node, &exp->exp_obd->obd_exports_timed,
+ ptlrpc_exp_deadline_less);
+ } else {
+ /* Reuse the existing bucket; it must already hold exports */
+ ned = rb_entry(node, struct rb_node_exp_deadline, ned_node);
+ LASSERT(!list_empty(&ned->ned_head));
+ }
+
+ list_add_tail(&exp->exp_timed_chain, &ned->ned_head);
+}
+EXPORT_SYMBOL(obd_export_timed_add);
+
+/**
+ * Remove @exp from its obd's eviction-deadline tree; a no-op if the export
+ * is not timed (empty exp_timed_chain). Frees the bucket when @exp was its
+ * last member. Caller holds obd_dev_lock.
+ *
+ * Invariant: a timed export's bucket must be present in the tree, so the
+ * rb_find() below is expected to succeed (the LASSERTs would trip on a
+ * dangling/corrupted entry).
+ */
+void obd_export_timed_del(struct obd_export *exp)
+{
+ struct rb_node_exp_deadline *ned;
+
+ if (list_empty(&exp->exp_timed_chain))
+ return;
+
+ ned = rb_entry(rb_find(&exp->exp_deadline,
+ &exp->exp_obd->obd_exports_timed,
+ ptlrpc_exp_deadline_cmp),
+ struct rb_node_exp_deadline, ned_node);
+ LASSERT(!list_empty(&ned->ned_head));
+ LASSERT(ned->ned_deadline == exp->exp_deadline);
+ list_del_init(&exp->exp_timed_chain);
+
+ /* last export with this deadline: drop the bucket from the tree */
+ if (list_empty(&ned->ned_head)) {
+ rb_erase(&ned->ned_node, &exp->exp_obd->obd_exports_timed);
+ OBD_FREE_PTR(ned);
+ }
+}
+EXPORT_SYMBOL(obd_export_timed_del);
+
+/**
+ * Return one export from the deadline tree, or NULL if the tree is empty:
+ * the first export of the earliest-deadline bucket (@last == false), or of
+ * the latest-deadline bucket (@last == true). Caller holds obd_dev_lock.
+ */
+struct obd_export *obd_export_timed_get(struct obd_device *obd, bool last)
+{
+ struct rb_node_exp_deadline *ned;
+ struct rb_node *node;
+
+ node = last ? rb_last(&obd->obd_exports_timed) :
+ rb_first(&obd->obd_exports_timed);
+
+ if (node == NULL)
+ return NULL;
+
+ /* a bucket in the tree always holds at least one export */
+ ned = rb_entry(node, struct rb_node_exp_deadline, ned_node);
+ LASSERT(!list_empty(&ned->ned_head));
+
+ return list_first_entry(&ned->ned_head, struct obd_export,
+ exp_timed_chain);
+}
+EXPORT_SYMBOL(obd_export_timed_get);
+
void class_unlink_export(struct obd_export *exp)
{
class_handle_unhash(&exp->exp_handle);
#endif /* HAVE_SERVER_SUPPORT */
list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports);
- list_del_init(&exp->exp_obd_chain_timed);
+ obd_export_timed_del(exp);
exp->exp_obd->obd_num_exports--;
spin_unlock(&exp->exp_obd->obd_dev_lock);
}
obd->obd_self_export = exp;
- spin_lock(&exp->exp_lock);
- exp->exp_not_timed = 1;
- spin_unlock(&exp->exp_lock);
class_export_put(exp);
rc = class_register_device(obd);
LUSTRE_STATIC_UINT_ATTR(enable_stats_header, &obd_enable_stats_header);
LUSTRE_STATIC_UINT_ATTR(lbug_on_eviction, &obd_lbug_on_eviction);
LUSTRE_STATIC_UINT_ATTR(ping_interval, &ping_interval);
-LUSTRE_STATIC_UINT_ATTR(evict_multiplier, &ping_evict_timeout_multiplier);
#ifdef HAVE_SERVER_SUPPORT
LUSTRE_STATIC_UINT_ATTR(ldlm_timeout, &ldlm_timeout);
&lustre_attr_enable_fname_encoding.attr,
&lustre_sattr_lbug_on_eviction.u.attr,
&lustre_sattr_ping_interval.u.attr,
- &lustre_sattr_evict_multiplier.u.attr,
NULL,
};
ocd->ocd_group = FID_SEQ_ECHO;
rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL);
- if (rc == 0) {
- /* Turn off pinger because it connects to tgt obd directly. */
- spin_lock(&ec->ec_exp->exp_lock);
- ec->ec_exp->exp_not_timed = 1;
- spin_unlock(&ec->ec_exp->exp_lock);
- }
-
OBD_FREE(ocd, sizeof(*ocd));
if (rc != 0) {
data->ocd_version = LUSTRE_VERSION_CODE;
- if (OCD_HAS_FLAG(data, PINGLESS)) {
- if (ptlrpc_pinger_suppress_pings()) {
- spin_lock(&exp->exp_lock);
- exp->exp_not_timed = 1;
- spin_unlock(&exp->exp_lock);
- } else {
- data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
- }
+ if (OCD_HAS_FLAG(data, PINGLESS) && !ptlrpc_pinger_suppress_pings())
+ data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
+
+ if (!OCD_HAS_FLAG(data, PINGLESS)) {
+ spin_lock(&exp->exp_lock);
+ exp->exp_timed = 1;
+ spin_unlock(&exp->exp_lock);
}
if (!ofd->ofd_lut.lut_dt_conf.ddp_has_lseek_data_hole)
{
struct obd_device *obd;
struct obd_export *exp;
- time64_t expire_time;
+ time64_t current_time;
struct lu_env env;
int rc;
CFS_FAIL_TIMEOUT(OBD_FAIL_OBD_PAUSE_EVICTOR,
PING_INTERVAL + PING_EVICT_TIMEOUT);
- expire_time = ktime_get_real_seconds() - PING_EVICT_TIMEOUT;
+ current_time = ktime_get_real_seconds();
- CDEBUG(D_HA, "evicting all exports of obd %s older than %lld\n",
- obd->obd_name, expire_time);
+ CDEBUG(D_HA, "evicting all exports of obd %s\n", obd->obd_name);
/*
* Exports can't be deleted out of the list while we hold
* removed from the list, we won't find them here.
*/
spin_lock(&obd->obd_dev_lock);
- while (!list_empty(&obd->obd_exports_timed)) {
- exp = list_first_entry(&obd->obd_exports_timed,
- struct obd_export,
- exp_obd_chain_timed);
- if (expire_time > exp->exp_last_request_time) {
+ while((exp = obd_export_timed_get(obd, false))) {
+ if (current_time > exp->exp_deadline) {
struct obd_uuid *client_uuid;
class_export_get(exp);
client_uuid = &exp->exp_client_uuid;
spin_unlock(&obd->obd_dev_lock);
- LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %lld seconds. I think it's dead, and I am evicting it. exp %p, cur %lld expire %lld last %lld\n",
+ LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %lld seconds. I think it's dead, and I am evicting it. exp %p, cur %lld deadline %lld last %lld\n",
obd->obd_name,
obd_uuid2str(client_uuid),
obd_export_nid2str(exp),
ktime_get_real_seconds() -
exp->exp_last_request_time,
- exp, ktime_get_real_seconds(),
- expire_time,
+ exp, current_time,
+ exp->exp_deadline,
exp->exp_last_request_time);
CDEBUG(D_HA, "Last request was at %lld\n",
exp->exp_last_request_time);
}
/**
+ * Calculate an export eviction timeout.
+ * Used for both cases, lock prolong timeout and ping evictor timeout.
+ *
+ * Whereas a problem client may be still alive trying hard to reconnect and to
+ * resend its RPCs, we should not consider the worst ever case, consisting of
+ * a chain of failures on each step. Let this timeout survive a recovery of
+ * just 1 failure, but let this be the worst possible one - a dead server NID:
+ *
+ * - an RPC timeout;
+ * - the first re-connect is sent to the same NID and times out;
+ * - the second re-connect to the failover pair returns an error;
+ * - the third re-connect to the original node to a different NID succeeds;
+ * - the RPC resend succeeds;
+ *
+ * For lock prolong timeout, we are in the middle of the process -
+ * BL AST is sent, CANCEL is ahead - it is still 1 reply for the current RPC
+ * and at least 1 another RPC (which will trigger another refresh if it will be
+ * not CANCEL) - but more accurate than ldlm_bl_timeout as the timeout is taken
+ * from the RPC (i.e. the view of the client on the current AT) is taken into
+ * account.
+ *
+ * \param[in] at AT of RPC service time to calculate timeout for
+ * \param[in] netl network AT
+ * \param[in] rpc_left_time left service time for the current RPC
+ * 0 if not applicable
+ * \param[in] pinger if the caller is ping evictor or ldlm
+ *
+ * \retval timeout in seconds to wait for the next client's RPC
+ */
+static timeout_t ptlrpc_export_timeout(struct obd_device *obd,
+ struct adaptive_timeout *at,
+ timeout_t netl,
+ timeout_t rpc_left_time,
+ bool pinger)
+{
+ timeout_t timeout, at_timeout, req_timeout;
+
+ /* AT disabled: fall back to the static half-obd_timeout heuristic */
+ if (obd_at_off(obd))
+ return obd_timeout / 2;
+
+ if (pinger) {
+ /* There might be a delay till the next RPC. In fact it is two
+ * PING_INTERVALs due to ptlrpc_pinger_main logic. */
+ timeout = 2 * PING_INTERVAL;
+ } else {
+ /* For the lock prolong, we have an RPC in hand, which may still
+ * get its reply lost. Therefore, it may be either this one or
+ * the next client's RPC times out, take the max.
+ * Considering the current RPC, take just the left time. */
+ LASSERT(at != NULL);
+ at_timeout = at_est2timeout(obd_at_get(obd, at)) + netl;
+ req_timeout = max(rpc_left_time + netl, at_timeout);
+ /* Adding the RPC resend time - not needed in the ping evictor
+ * case, export is updated on re-connect */
+ timeout = req_timeout + at_timeout;
+ }
+
+ /* Adding the re-connect time: 1st re-connect timeout,
+ * 2nd reconnect error, 3rd reconnect success. */
+ timeout += 3 * (INITIAL_CONNECT_TIMEOUT + netl);
+
+ /* Let's be a bit more conservative than client */
+ return max(timeout + (timeout >> 4),
+ (timeout_t)obd_get_ldlm_enqueue_min(obd));
+}
+
+/**
+ * Used for lock prolog timeout, calculates a timeout for CANCEL to come.
+ * Also used for recovery, calculates a timeout for a next recovery RPC to come.
+ * In this case, there is an RPC, in hand. Thus, a particular svcpt AT is used.
+ *
+ * The reverse import network AT is used as an estimate for the client side one.
+ */
+timeout_t ptlrpc_export_prolong_timeout(struct ptlrpc_request *req,
+ bool recovery)
+{
+ timeout_t netl;
+
+ /* In recovery, take the net latency the client packed into the
+ * request; otherwise use the reverse import's AT estimate.
+ * NOTE(review): in recovery the reverse import may not be set up
+ * yet, presumably why the message value is used - confirm. */
+ if (recovery)
+ netl = lustre_msg_get_service_timeout(req->rq_reqmsg);
+ else
+ netl = obd_at_get(req->rq_export->exp_obd,
+ &req->rq_export->exp_imp_reverse->
+ imp_at.iat_net_latency);
+
+ /* pinger=false: an RPC is in hand, use its remaining service time */
+ return ptlrpc_export_timeout(req->rq_export->exp_obd,
+ &req->rq_rqbd->rqbd_svcpt->scp_at_estimate,
+ netl, req->rq_deadline -
+ ktime_get_real_seconds(), false);
+}
+
+/**
+ * Used for ping evictor, calculates a timeout for any next RPC to come.
+ * As there are different portals and the AT stats is separated for them,
+ * just the last RPC AT is used here.
+ *
+ * The reverse import network AT is used as an estimate for the client side one.
+ */
+static timeout_t ptlrpc_export_pinger_timeout(struct ptlrpc_request *req)
+{
+ struct obd_import *revimp = req->rq_export->exp_imp_reverse;
+ timeout_t netl = obd_at_get(req->rq_export->exp_obd,
+ &revimp->imp_at.iat_net_latency);
+
+ /* pinger=true, no RPC left-time: waiting for any next client RPC */
+ return ptlrpc_export_timeout(req->rq_export->exp_obd,
+ &req->rq_rqbd->rqbd_svcpt->scp_at_estimate,
+ netl, 0, true);
+}
+
+/**
+ * In case the net was down and just came back, when the 1st timeout has been
+ * already expired, clients just keep sending re-connects. Applying the same
+ * formula as in ptlrpc_export_timeout() to this case we get:
+ * - a previous reconnect to not yet recovered network, times out;
+ * - the second reconnect to the failover pair, ENODEV;
+ * - the third reconnect succeeds;
+ */
+static timeout_t ptlrpc_export_extra_timeout(struct obd_export *exp)
+{
+ timeout_t netl;
+
+ /* As this is not the 1st re-connection failure, the client might
+ * have net latency get extended to the max - CONNECTION_SWITCH_MAX */
+ netl = obd_at_get(exp->exp_obd,
+ &exp->exp_imp_reverse->imp_at.iat_net_latency);
+ /* 3 reconnect attempts plus the worst-case connection switch delay */
+ return 3 * INITIAL_CONNECT_TIMEOUT + CONNECTION_SWITCH_MAX + 2 * netl;
+}
+
+/**
* This function makes sure dead exports are evicted in a timely manner.
* This function is only called when some export receives a message (i.e.,
* the network is up.)
*/
-void ptlrpc_update_export_timer(struct obd_export *exp, time64_t extra_delay)
+void ptlrpc_update_export_timer(struct ptlrpc_request *req)
{
- struct obd_export *oldest_exp, *newest_exp;
- time64_t oldest_time, current_time;
- bool evict = false;
+ struct obd_export *oldest_exp, *newest_exp, *exp;
+ time64_t current_time, timeout;
+ bool evict = false;
+ void *data;
+ int rc;
ENTRY;
- LASSERT(exp);
-
- /*
- * Compensate for slow machines, etc, by faking our request time
- * into the future. Although this can break the strict time-ordering
- * of the list, we can be really lazy here - we don't have to evict
- * at the exact right moment. Eventually, all silent exports
- * will make it to the top of the list.
- */
+ LASSERT(req != NULL);
+ LASSERT(req->rq_export != NULL);
- /* Do not pay attention on 1sec or smaller renewals. */
+ exp = req->rq_export;
current_time = ktime_get_real_seconds();
- /* 1 seconds */
- if (exp->exp_last_request_time + 1 >= current_time + extra_delay)
- RETURN_EXIT;
- exp->exp_last_request_time = current_time + extra_delay;
+ rc = obd_export_timed_init(exp, &data);
+ if (rc)
+ /* will be updated next time */
+ RETURN_EXIT;
/*
* exports may get disconnected from the chain even though the
* manipulating the lists
*/
spin_lock(&exp->exp_obd->obd_dev_lock);
-
- if (list_empty(&exp->exp_obd_chain_timed)) {
+ if (list_empty(&exp->exp_timed_chain)) {
/* this one is not timed */
spin_unlock(&exp->exp_obd->obd_dev_lock);
- RETURN_EXIT;
+ GOTO(err, 0);
}
- newest_exp = list_last_entry(&exp->exp_obd->obd_exports_timed,
- struct obd_export, exp_obd_chain_timed);
+ exp->exp_last_request_time = current_time;
- list_move_tail(&exp->exp_obd_chain_timed,
- &exp->exp_obd->obd_exports_timed);
+ timeout = ptlrpc_export_pinger_timeout(req);
- if (test_bit(OBDF_RECOVERING, exp->exp_obd->obd_flags)) {
- /* be nice to everyone during recovery */
+ /* Do not pay attention on 1sec or smaller renewals. */
+ if (exp->exp_deadline + 1 >= current_time + timeout) {
spin_unlock(&exp->exp_obd->obd_dev_lock);
- RETURN_EXIT;
+ GOTO(err, 0);
}
- oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
- struct obd_export, exp_obd_chain_timed);
+ newest_exp = obd_export_timed_get(exp->exp_obd, true);
+ obd_export_timed_del(exp);
+ exp->exp_deadline = current_time + timeout;
+ obd_export_timed_add(exp, &data);
- oldest_time = oldest_exp->exp_last_request_time;
+ if (test_bit(OBDF_RECOVERING, exp->exp_obd->obd_flags)) {
+ /* be nice to everyone during recovery */
+ spin_unlock(&exp->exp_obd->obd_dev_lock);
+ GOTO(err, 0);
+ }
+ oldest_exp = obd_export_timed_get(exp->exp_obd, false);
/* Check if the oldest entry is expired. */
- if (exp->exp_obd->obd_eviction_timer == 0 &&
- current_time > oldest_time + PING_EVICT_TIMEOUT + extra_delay) {
-
- if (current_time < newest_exp->exp_last_request_time +
- PING_EVICT_TIMEOUT / 2) {
- /* If import is active - evict stale clients */
- evict = true;
- } else {
- /*
- * We need a second timer, in case the net was down and
- * it just came back. Since the pinger may skip every
- * other PING_INTERVAL (see note in ptlrpc_pinger_main),
- * we better wait for 3.
- */
- exp->exp_obd->obd_eviction_timer =
- ktime_get_real_seconds() + 3 * PING_INTERVAL;
- CDEBUG(D_HA, "%s: Think about evicting %s from %lld\n",
- exp->exp_obd->obd_name,
- obd_export_nid2str(oldest_exp), oldest_time);
-
+ if (exp->exp_obd->obd_eviction_timer == 0) {
+ if (current_time > oldest_exp->exp_deadline) {
+ timeout = newest_exp->exp_last_request_time +
+ ((newest_exp->exp_deadline -
+ newest_exp->exp_last_request_time) >> 1);
+ if (current_time < timeout) {
+ /* If import is active - evict stale clients */
+ evict = true;
+ } else {
+ /*
+ * We need a second timer, in case the net was
+ * down and it just came back.
+ */
+ exp->exp_obd->obd_eviction_timer =
+ ktime_get_real_seconds() +
+ ptlrpc_export_extra_timeout(oldest_exp);
+ CDEBUG(D_HA, "%s: Think about evicting %s "
+ "from %lld deadline at %lld\n",
+ exp->exp_obd->obd_name,
+ obd_export_nid2str(oldest_exp),
+ oldest_exp->exp_deadline,
+ exp->exp_obd->obd_eviction_timer);
+ }
}
}
-
spin_unlock(&exp->exp_obd->obd_dev_lock);
if (evict) {
ping_evictor_wake(exp);
} else {
if (ktime_get_real_seconds() >
- (exp->exp_obd->obd_eviction_timer + extra_delay)) {
+ exp->exp_obd->obd_eviction_timer) {
/*
* The evictor won't evict anyone who we've heard from
* recently, so we don't have to check before we start
}
EXIT;
+err:
+ obd_export_timed_fini(exp, &data);
}
/**
if (rc)
goto err_req;
- ptlrpc_update_export_timer(req->rq_export, 0);
+
+ ptlrpc_update_export_timer(req);
}
/* req_in handling should/must be fast */
if (likely(request->rq_export)) {
if (unlikely(ptlrpc_check_req(request)))
goto put_conn;
- ptlrpc_update_export_timer(request->rq_export,
- div_u64(timediff_usecs,
- USEC_PER_SEC / 2));
+
+ ptlrpc_update_export_timer(request);
}
/*
lctl get_param -n mdt.${mds1_svc}.num_exports)
local ost_nexp=$(do_facet ost1 \
lctl get_param -n obdfilter.${ost1_svc}.num_exports)
+ # must be equal on all the nodes
+ local INTERVAL=$(do_facet $SINGLEMDS lctl get_param -n ping_interval)
+ local AT_MAX_SAVED=$(at_max_get mds1)
+
+ at_max_set $TIMEOUT mds1
+ at_max_set $TIMEOUT ost1
+ stack_trap "at_max_set $AT_MAX_SAVED mds1" EXIT
+ stack_trap "at_max_set $AT_MAX_SAVED ost1" EXIT
echo "starting with '$ost_nexp' OST and '$mds_nexp' MDS exports"
zconf_umount $HOSTNAME $MOUNT2 -f
- # PING_INTERVAL max(obd_timeout / 4, 1U)
- # PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
-
- # evictor takes PING_EVICT_TIMEOUT to evict.
- # But if there's a race to start the evictor from various obds,
- # the loser might have to wait for the next ping.
- # = 6 * PING_INTERVAL + PING_INTERVAL
- # = 7 PING_INTERVAL = 7 obd_timeout / 4 = (1+3/4)obd_timeout
- # let's wait $((TIMEOUT * 2)) # bug 19887
- wait_client_evicted ost1 $ost_nexp $((TIMEOUT * 2)) ||
+ # see ptlrpc_export_timeout() for the pinger case; allow a bit extra for the test's sake
+ local TOUT=$((INTERVAL * 2 + (TIMEOUT / 20 + 5 + TIMEOUT) * 3))
+ TOUT=$((TOUT + (TOUT >> 3)))
+ echo i $INTERVAL m $AT_MAX_SAVED t $TIMEOUT $TOUT
+ wait_client_evicted ost1 $ost_nexp $TOUT ||
error "Client was not evicted by OSS"
- wait_client_evicted mds1 $mds_nexp $((TIMEOUT * 2)) ||
+ wait_client_evicted mds1 $mds_nexp $TOUT ||
error "Client was not evicted by MDS"
}
run_test 26b "evict dead exports"