DEBUG_REQ(D_INFO, req, "pinging %s->%s",
imp->imp_obd->obd_uuid.uuid,
obd2cli_tgt(imp->imp_obd));
+
+ /* To quickly detect server failure ping timeouts must be
+ * kept small. Therefore we must override/ignore the server
+ * rpc completion estimate which may be very large since
+ * it includes non-ping service times. The right long term
+ * fix will be to add a per-server (not per-service) thread
+ * in order to reduce the number of pings in the system in
+ * general (see bug 12471). */
+ if (!AT_OFF) {
+ req->rq_timeout = PING_SVC_TIMEOUT +
+ at_get(&imp->imp_at.iat_net_latency);
+ lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+ }
+
ptlrpcd_add_req(req);
} else {
CERROR("OOM trying to ping %s->%s\n",
}
EXPORT_SYMBOL(ptlrpc_ping);
-void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
+static void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
{
#ifdef ENABLE_PINGER
- int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
- if (imp->imp_state == LUSTRE_IMP_DISCON) {
- int dtime = max_t(int, CONNECTION_SWITCH_MIN,
- AT_OFF ? 0 :
- at_get(&imp->imp_at.iat_net_latency));
- time = min(time, dtime);
+ cfs_time_t delay, dtime, ctime = cfs_time_current();
+
+ if (imp->imp_state == LUSTRE_IMP_DISCON ||
+ imp->imp_state == LUSTRE_IMP_CONNECTING) {
+ /* In the disconnected case aggressively reconnect, for
+ * this request the AT service timeout will be set to
+ * INITIAL_CONNECT_TIMEOUT. To ensure the request times
+ * out before we send another we add one extra second. */
+ dtime = cfs_time_seconds(max_t(int, CONNECTION_SWITCH_MIN,
+ AT_OFF ? 0 : INITIAL_CONNECT_TIMEOUT + 1 +
+ at_get(&imp->imp_at.iat_net_latency)));
+ } else {
+ /* In the common case we want to cluster the pings at
+ * at regular intervals to minimize system noise. */
+ delay = cfs_time_seconds(soon ? PING_INTERVAL_SHORT :
+ PING_INTERVAL);
+ dtime = delay - (ctime % delay);
}
- imp->imp_next_ping = cfs_time_shift(time);
+ /* May harmlessly race with ptlrpc_update_next_ping() */
+ imp->imp_next_ping = cfs_time_add(ctime, dtime);
+
+ CDEBUG(D_HA, "Setting %s next ping to "CFS_TIME_T" ("CFS_TIME_T")\n",
+ obd2cli_tgt(imp->imp_obd), imp->imp_next_ping, dtime);
+
#endif /* ENABLE_PINGER */
}
cfs_time_t this_ping = cfs_time_current();
struct l_wait_info lwi;
cfs_duration_t time_to_next_wake;
+ cfs_time_t time_of_next_wake;
struct timeout_item *item;
struct list_head *iter;
+ time_to_next_wake = cfs_time_seconds(PING_INTERVAL);
+ time_of_next_wake = cfs_time_shift(PING_INTERVAL);
+
mutex_down(&pinger_sem);
list_for_each_entry(item, &timeout_list, ti_chain) {
item->ti_cb(item, item->ti_cb_data);
ptlrpc_import_state_name(level), level,
force, imp->imp_deactive, imp->imp_pingable);
+ /* Include any ping which misses the deadline by up to
+ * 1/10 of a second. The pings are designed to clump
+ * and this helps ensure the entire batch gets sent
+ * promptly, which minimizes system noise from pings */
+
if (force ||
- /* if the next ping is within, say, 5 jiffies from
- now, go ahead and ping. See note below. */
- cfs_time_aftereq(this_ping,
- imp->imp_next_ping - 5 * CFS_TICK)) {
+ cfs_time_aftereq(this_ping, imp->imp_next_ping -
+ (cfs_time_seconds(1) + 9) / 10)) {
if (level == LUSTRE_IMP_DISCON &&
!imp_is_deactive(imp)) {
- /* wait at least a timeout before
- trying recovery again. */
- imp->imp_next_ping = cfs_time_shift(obd_timeout);
+ ptlrpc_update_next_ping(imp, 0);
ptlrpc_initiate_recovery(imp);
} else if (level != LUSTRE_IMP_FULL ||
imp->imp_obd->obd_no_recov ||
imp->imp_obd->obd_no_recov);
} else if (imp->imp_pingable || force) {
ptlrpc_ping(imp);
+ /* ptlrpc_pinger_sending_on_import()
+ * will asynch update imp_next_ping
+ * so it must not be used below to
+ * calculate minimum wait time. */
+ continue;
}
} else {
if (!imp->imp_pingable)
imp->imp_next_ping, this_ping);
}
- /* obd_timeout might have changed */
- if (cfs_time_after(imp->imp_next_ping,
- cfs_time_add(this_ping,
- cfs_time_seconds(PING_INTERVAL))))
- ptlrpc_update_next_ping(imp, 0);
+ /* Wait time until next ping, or until we stopped. */
+ if (cfs_time_before(imp->imp_next_ping,
+ time_of_next_wake)) {
+ time_of_next_wake = imp->imp_next_ping;
+ time_to_next_wake = max_t(cfs_duration_t,
+ cfs_time_seconds(1),
+ cfs_time_sub(time_of_next_wake,
+ cfs_time_current()));
+ }
}
mutex_up(&pinger_sem);
- /* update memory usage info */
obd_update_maxusage();
+ CDEBUG(D_INFO, "next ping in "CFS_DURATION_T" ("CFS_TIME_T")\n",
+ time_to_next_wake, time_of_next_wake);
- /* Wait until the next ping time, or until we're stopped. */
- time_to_next_wake = pinger_check_timeout(this_ping);
- /* The ping sent by ptlrpc_send_rpc may get sent out
- say .01 second after this.
- ptlrpc_pinger_eending_on_import will then set the
- next ping time to next_ping + .01 sec, which means
- we will SKIP the next ping at next_ping, and the
- ping will get sent 2 timeouts from now! Beware. */
- CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" ("CFS_TIME_T")\n",
- time_to_next_wake,
- cfs_time_add(this_ping, cfs_time_seconds(PING_INTERVAL)));
if (time_to_next_wake > 0) {
- lwi = LWI_TIMEOUT(max_t(cfs_duration_t, time_to_next_wake, cfs_time_seconds(1)),
- NULL, NULL);
+ lwi = LWI_TIMEOUT(time_to_next_wake, NULL, NULL);
l_wait_event(thread->t_ctl_waitq,
thread->t_flags & (SVC_STOPPING|SVC_EVENT),
&lwi);
list_entry(iter, struct obd_import, imp_pinger_chain);
int generation, level;
- if (cfs_time_aftereq(pd->pd_this_ping,
- imp->imp_next_ping - 5 * CFS_TICK)) {
+ /* Include any ping within 1/10 of a second of the deadline */
+ if (cfs_time_aftereq(pd->pd_this_ping, imp->imp_next_ping -
+ (cfs_time_seconds(1) + 9) / 10)) {
/* Add a ping. */
spin_lock(&imp->imp_lock);
generation = imp->imp_generation;