static time64_t ptlrpc_inflight_timeout(struct obd_import *imp)
{
time64_t now = ktime_get_real_seconds();
- struct list_head *tmp, *n;
struct ptlrpc_request *req;
time64_t timeout = 0;
spin_lock(&imp->imp_lock);
- list_for_each_safe(tmp, n, &imp->imp_sending_list) {
- req = list_entry(tmp, struct ptlrpc_request, rq_list);
+ list_for_each_entry(req, &imp->imp_sending_list, rq_list)
timeout = max(ptlrpc_inflight_deadline(req, now), timeout);
- }
spin_unlock(&imp->imp_lock);
return timeout;
}
*/
void ptlrpc_invalidate_import(struct obd_import *imp)
{
- struct list_head *tmp, *n;
struct ptlrpc_request *req;
time64_t timeout;
int rc;
* this point. */
rc = 1;
} else {
- list_for_each_safe(tmp, n,
- &imp->imp_sending_list) {
- req = list_entry(tmp,
- struct ptlrpc_request,
- rq_list);
+ list_for_each_entry(req, &imp->imp_sending_list,
+ rq_list) {
DEBUG_REQ(D_ERROR, req,
"still on sending list");
}
- list_for_each_safe(tmp, n,
- &imp->imp_delayed_list) {
- req = list_entry(tmp,
- struct ptlrpc_request,
- rq_list);
+ list_for_each_entry(req, &imp->imp_delayed_list,
+ rq_list) {
DEBUG_REQ(D_ERROR, req,
"still on delayed list");
}
int ptlrpc_reconnect_import(struct obd_import *imp)
{
-#ifdef ENABLE_PINGER
+#ifdef CONFIG_LUSTRE_FS_PINGER
long timeout_jiffies = cfs_time_seconds(obd_timeout);
int rc;
*/
static int import_select_connection(struct obd_import *imp)
{
- struct obd_import_conn *imp_conn = NULL, *conn;
- struct obd_export *dlmexp;
- char *target_start;
- int target_len, tried_all = 1;
- ENTRY;
+ struct obd_import_conn *imp_conn = NULL, *conn;
+ struct obd_export *dlmexp;
+ char *target_start;
+ int target_len, tried_all = 1;
+ int rc = 0;
+ ENTRY;
spin_lock(&imp->imp_lock);
if (list_empty(&imp->imp_conn_list)) {
- CERROR("%s: no connections available\n",
- imp->imp_obd->obd_name);
- spin_unlock(&imp->imp_lock);
- RETURN(-EINVAL);
+ rc = -EINVAL;
+ CERROR("%s: no connections available: rc = %d\n",
+ imp->imp_obd->obd_name, rc);
+ GOTO(out_unlock, rc);
}
list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n",
- imp->imp_obd->obd_name,
- libcfs_nid2str(conn->oic_conn->c_peer.nid),
- conn->oic_last_attempt);
+ imp->imp_obd->obd_name,
+ libcfs_nid2str(conn->oic_conn->c_peer.nid),
+ conn->oic_last_attempt);
- /* If we have not tried this connection since
- the last successful attempt, go with this one */
- if ((conn->oic_last_attempt == 0) ||
+ /* If we have not tried this connection since
+ * the last successful attempt, go with this one
+ */
+ if ((conn->oic_last_attempt == 0) ||
conn->oic_last_attempt <= imp->imp_last_success_conn) {
- imp_conn = conn;
- tried_all = 0;
- break;
- }
+ imp_conn = conn;
+ tried_all = 0;
+ break;
+ }
- /* If all of the connections have already been tried
- since the last successful connection; just choose the
- least recently used */
- if (!imp_conn)
- imp_conn = conn;
+ /* If all of the connections have already been tried
+ * since the last successful connection; just choose the
+ * least recently used
+ */
+ if (!imp_conn)
+ imp_conn = conn;
else if (imp_conn->oic_last_attempt > conn->oic_last_attempt)
- imp_conn = conn;
- }
+ imp_conn = conn;
+ }
- /* if not found, simply choose the current one */
- if (!imp_conn || imp->imp_force_reconnect) {
- LASSERT(imp->imp_conn_current);
- imp_conn = imp->imp_conn_current;
- tried_all = 0;
- }
- LASSERT(imp_conn->oic_conn);
-
- /* If we've tried everything, and we're back to the beginning of the
- list, increase our timeout and try again. It will be reset when
- we do finally connect. (FIXME: really we should wait for all network
- state associated with the last connection attempt to drain before
- trying to reconnect on it.) */
- if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+ /* if not found, simply choose the current one */
+ if (!imp_conn || imp->imp_force_reconnect) {
+ LASSERT(imp->imp_conn_current);
+ imp_conn = imp->imp_conn_current;
+ tried_all = 0;
+ }
+ LASSERT(imp_conn->oic_conn);
+
+ /* If we've tried everything, and we're back to the beginning of the
+ * list, increase our timeout and try again. It will be reset when
+ * we do finally connect. (FIXME: really we should wait for all network
+ * state associated with the last connection attempt to drain before
+ * trying to reconnect on it.)
+ */
+ if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+
if (at_get(at) < CONNECTION_SWITCH_MAX) {
at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
if (at_get(at) > CONNECTION_SWITCH_MAX)
at_reset(at, CONNECTION_SWITCH_MAX);
}
LASSERT(imp_conn->oic_last_attempt);
- CDEBUG(D_HA, "%s: tried all connections, increasing latency "
- "to %ds\n", imp->imp_obd->obd_name, at_get(at));
+ CDEBUG(D_HA,
+ "%s: tried all connections, increasing latency to %ds\n",
+ imp->imp_obd->obd_name, at_get(at));
}
imp_conn->oic_last_attempt = ktime_get_seconds();
- /* switch connection, don't mind if it's same as the current one */
- if (imp->imp_connection)
- ptlrpc_connection_put(imp->imp_connection);
- imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
-
- dlmexp = class_conn2export(&imp->imp_dlm_handle);
- LASSERT(dlmexp != NULL);
- if (dlmexp->exp_connection)
- ptlrpc_connection_put(dlmexp->exp_connection);
- dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
- class_export_put(dlmexp);
-
- if (imp->imp_conn_current != imp_conn) {
- if (imp->imp_conn_current) {
- deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
- &target_start, &target_len);
-
- CDEBUG(D_HA, "%s: Connection changing to"
- " %.*s (at %s)\n",
- imp->imp_obd->obd_name,
- target_len, target_start,
- libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
- }
+ /* switch connection, don't mind if it's same as the current one */
+ ptlrpc_connection_put(imp->imp_connection);
+ imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
- imp->imp_conn_current = imp_conn;
- }
+ dlmexp = class_conn2export(&imp->imp_dlm_handle);
+ if (!dlmexp)
+ GOTO(out_unlock, rc = -EINVAL);
+ ptlrpc_connection_put(dlmexp->exp_connection);
+ dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+ class_export_put(dlmexp);
+
+ if (imp->imp_conn_current != imp_conn) {
+ if (imp->imp_conn_current) {
+ deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+ &target_start, &target_len);
+
+ CDEBUG(D_HA, "%s: Connection changing to"
+ " %.*s (at %s)\n",
+ imp->imp_obd->obd_name,
+ target_len, target_start,
+ libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+ }
+
+ imp->imp_conn_current = imp_conn;
+ }
/* The below message is checked in conf-sanity.sh test_35[ab] */
- CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
- imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
- libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+ CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
+ imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
+ libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+out_unlock:
spin_unlock(&imp->imp_lock);
-
- RETURN(0);
+ RETURN(rc);
}
/*
*/
static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
{
- struct ptlrpc_request *req;
- struct list_head *tmp;
+ struct ptlrpc_request *req;
/* The requests in committed_list always have smaller transnos than
* the requests in replay_list */
if (!list_empty(&imp->imp_committed_list)) {
- tmp = imp->imp_committed_list.next;
- req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+ req = list_first_entry(&imp->imp_committed_list,
+ struct ptlrpc_request, rq_replay_list);
*transno = req->rq_transno;
if (req->rq_transno == 0) {
DEBUG_REQ(D_ERROR, req,
return 1;
}
if (!list_empty(&imp->imp_replay_list)) {
- tmp = imp->imp_replay_list.next;
- req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+ /* head of replay_list, NOT committed_list (copy/paste bug in
+ * the original patch: it emptiness-checked imp_replay_list but
+ * then read the first entry of imp_committed_list)
+ */
+ req = list_first_entry(&imp->imp_replay_list,
+ struct ptlrpc_request, rq_replay_list);
*transno = req->rq_transno;
if (req->rq_transno == 0) {
DEBUG_REQ(D_ERROR, req, "zero transno in replay_list");
/* Report the rpc service time to the server so that it knows how long
* to wait for clients to join recovery */
- lustre_msg_set_service_time(request->rq_reqmsg,
- at_timeout2est(request->rq_timeout));
+ lustre_msg_set_service_timeout(request->rq_reqmsg,
+ at_timeout2est(request->rq_timeout));
/* The amount of time we give the server to process the connect req.
* import_select_connection will increase the net latency on
lustre_msg_add_op_flags(request->rq_reqmsg,
MSG_CONNECT_TRANSNO);
- DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %ld)",
+ DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)",
request->rq_timeout);
ptlrpcd_add_req(request);
rc = 0;
struct obd_import *imp = request->rq_import;
struct lustre_handle old_hdl;
__u64 old_connect_flags;
+ timeout_t service_timeout;
int msg_flags;
struct obd_connect_data *ocd;
struct obd_export *exp = NULL;
* for connecting*/
imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
spin_unlock(&imp->imp_lock);
- ptlrpc_maybe_ping_import_soon(imp);
GOTO(out, rc);
}
imp->imp_obd->obd_self_export->exp_connect_data = *ocd;
/* The net statistics after (re-)connect is not valid anymore,
- * because may reflect other routing, etc. */
+ * because may reflect other routing, etc.
+ */
+ service_timeout = lustre_msg_get_service_timeout(request->rq_repmsg);
at_reinit(&imp->imp_at.iat_net_latency, 0, 0);
- ptlrpc_at_adj_net_latency(request,
- lustre_msg_get_service_time(
- request->rq_repmsg));
+ ptlrpc_at_adj_net_latency(request, service_timeout);
/* Import flags should be updated before waking import at FULL state */
rc = ptlrpc_connect_set_flags(imp, ocd, old_connect_flags, exp,
* with server again
*/
if ((MSG_CONNECT_RECOVERING & msg_flags)) {
- CDEBUG(level,
+ CDEBUG_LIMIT(level,
"%s@%s changed server handle from "
"%#llx to %#llx"
" but is still in recovery\n",
if (rc != 0) {
bool inact = false;
+ time64_t now = ktime_get_seconds();
+ time64_t next_connect;
import_set_state_nolock(imp, LUSTRE_IMP_DISCON);
if (rc == -EACCES) {
import_set_state_nolock(imp, LUSTRE_IMP_CLOSED);
inact = true;
}
+ } else if (rc == -ENODEV || rc == -ETIMEDOUT) {
+ /* ENODEV means there is no service, force reconnection
+ * to a pair if attempt happen ptlrpc_next_reconnect
+ * before now. ETIMEDOUT could be set during network
+ * error and do not guarantee request deadline happened.
+ */
+ struct obd_import_conn *conn;
+ time64_t reconnect_time;
+
+ /* Same as ptlrpc_next_reconnect, but in past */
+ reconnect_time = now - INITIAL_CONNECT_TIMEOUT;
+ list_for_each_entry(conn, &imp->imp_conn_list,
+ oic_item) {
+ if (conn->oic_last_attempt <= reconnect_time) {
+ imp->imp_force_verify = 1;
+ break;
+ }
+ }
}
+
+ next_connect = imp->imp_conn_current->oic_last_attempt +
+ (request->rq_deadline - request->rq_sent);
spin_unlock(&imp->imp_lock);
if (inact)
if (rc == -EPROTO)
RETURN(rc);
+ /* adjust imp_next_ping to request deadline + 1 and reschedule
+ * a pinger if import lost processing during CONNECTING or far
+ * away from request deadline. It could happen when connection
+ * was initiated outside of pinger, like
+ * ptlrpc_set_import_discon().
+ */
+ if (!imp->imp_force_verify && (imp->imp_next_ping <= now ||
+ imp->imp_next_ping > next_connect)) {
+ imp->imp_next_ping = max(now, next_connect) + 1;
+ ptlrpc_pinger_wake_up();
+ }
+
ptlrpc_maybe_ping_import_soon(imp);
CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
RETURN(0);
- LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
- atomic_inc(&imp->imp_replay_inflight);
+ if (!atomic_add_unless(&imp->imp_replay_inflight, 1, 1))
+ RETURN(0);
req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION,
OBD_PING);
struct obd_import *imp = data;
ENTRY;
-
- unshare_fs_struct();
-
CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
imp->imp_connection->c_remote_uuid.uuid);
ENTRY;
if (imp->imp_state == LUSTRE_IMP_EVICTED) {
+ struct task_struct *task;
+
deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
&target_start, &target_len);
/* Don't care about MGC eviction */
"using this service will fail.\n",
imp->imp_obd->obd_name, target_len,
target_start);
- LASSERTF(!obd_lbug_on_eviction, "LBUG upon eviction");
+ LASSERTF(!obd_lbug_on_eviction, "LBUG upon eviction\n");
}
CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
obd2cli_tgt(imp->imp_obd),
imp->imp_vbr_failed = 0;
spin_unlock(&imp->imp_lock);
- {
- struct task_struct *task;
/* bug 17802: XXX client_disconnect_export vs connect request
* race. if client is evicted at this time then we start
* invalidate thread without reference to import and import can
* be freed at same time. */
class_import_get(imp);
task = kthread_run(ptlrpc_invalidate_import_thread, imp,
- "ll_imp_inval");
+ "ll_imp_inval");
if (IS_ERR(task)) {
class_import_put(imp);
- CERROR("error starting invalidate thread: %d\n", rc);
rc = PTR_ERR(task);
+ CERROR("%s: can't start invalidate thread: rc = %d\n",
+ imp->imp_obd->obd_name, rc);
} else {
rc = 0;
}
RETURN(rc);
- }
}
if (imp->imp_state == LUSTRE_IMP_REPLAY) {
GOTO(out, rc);
ptlrpc_activate_import(imp, true);
- CDEBUG_LIMIT(imp->imp_was_idle ?
- imp->imp_idle_debug : D_CONSOLE,
- "%s: Connection restored to %s (at %s)\n",
- imp->imp_obd->obd_name,
- obd_uuid2str(&conn->c_remote_uuid),
- obd_import_nid2str(imp));
+ /* Reverse import are flagged with dlm_fake == 1.
+ * They do not do recovery and connection are not "restored".
+ */
+ if (!imp->imp_dlm_fake)
+ CDEBUG_LIMIT(imp->imp_was_idle ?
+ imp->imp_idle_debug : D_CONSOLE,
+ "%s: Connection restored to %s (at %s)\n",
+ imp->imp_obd->obd_name,
+ obd_uuid2str(&conn->c_remote_uuid),
+ obd_import_nid2str(imp));
spin_lock(&imp->imp_lock);
imp->imp_was_idle = 0;
spin_unlock(&imp->imp_lock);
/* We want client umounts to happen quickly, no matter the
server state... */
- req->rq_timeout = min_t(int, req->rq_timeout,
+ req->rq_timeout = min_t(timeout_t, req->rq_timeout,
INITIAL_CONNECT_TIMEOUT);
import_set_state(imp, LUSTRE_IMP_CONNECTING);
DEBUG_REQ(D_HA, req, "inflight=%d, refcount=%d: rc = %d",
atomic_read(&imp->imp_inflight),
- atomic_read(&imp->imp_refcount), rc);
+ refcount_read(&imp->imp_refcount), rc);
spin_lock(&imp->imp_lock);
/* DISCONNECT reply can be late and another connection can just
memset(&imp->imp_remote_handle, 0,
sizeof(imp->imp_remote_handle));
/* take our DISCONNECT into account */
- if (atomic_read(&imp->imp_inflight) > 1) {
+ if (atomic_read(&imp->imp_reqs) > 1) {
imp->imp_generation++;
imp->imp_initiated_at = imp->imp_generation;
import_set_state_nolock(imp, LUSTRE_IMP_NEW);
/* Adaptive Timeout utils */
-/* Update at_current with the specified value (bounded by at_min and at_max),
- * as well as the AT history "bins".
+/* Update at_current_timeout with the specified value (bounded by at_min and
+ * at_max), as well as the AT history "bins".
* - Bin into timeslices using AT_BINS bins.
* - This gives us a max of the last at_history seconds without the storage,
* but still smoothing out a return to normalcy from a slow response.
* - (E.g. remember the maximum latency in each minute of the last 4 minutes.)
*/
-int at_measured(struct adaptive_timeout *at, unsigned int val)
+timeout_t at_measured(struct adaptive_timeout *at, timeout_t timeout)
{
- unsigned int old = at->at_current;
+ timeout_t old_timeout = at->at_current_timeout;
time64_t now = ktime_get_real_seconds();
long binlimit = max_t(long, at_history / AT_BINS, 1);
LASSERT(at);
- CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
- val, at, (long)(now - at->at_binstart), at->at_current,
+ /* timeout_t is signed (the "timeout <= 0" check below relies on it),
+ * so print with %d — consistent with the "(val=%d)" CDEBUG later on
+ */
+ CDEBUG(D_OTHER, "add %d to %p time=%lld v=%d (%u %u %u %u)\n",
+ timeout, at, now - at->at_binstart, at->at_current_timeout,
at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
- if (val == 0)
- /* 0's don't count, because we never want our timeout to
- drop to 0, and because 0 could mean an error */
+ if (timeout <= 0)
+ /* Negative timeouts and 0's don't count, because we never
+ * want our timeout to drop to 0 or below, and because 0 could
+ * mean an error
+ */
return 0;
spin_lock(&at->at_lock);
if (unlikely(at->at_binstart == 0)) {
/* Special case to remove default from history */
- at->at_current = val;
- at->at_worst_ever = val;
- at->at_worst_time = now;
- at->at_hist[0] = val;
+ at->at_current_timeout = timeout;
+ at->at_worst_timeout_ever = timeout;
+ at->at_worst_timestamp = now;
+ at->at_hist[0] = timeout;
at->at_binstart = now;
} else if (now - at->at_binstart < binlimit ) {
/* in bin 0 */
- at->at_hist[0] = max(val, at->at_hist[0]);
- at->at_current = max(val, at->at_current);
+ at->at_hist[0] = max_t(timeout_t, timeout, at->at_hist[0]);
+ at->at_current_timeout = max_t(timeout_t, timeout,
+ at->at_current_timeout);
} else {
int i, shift;
- unsigned int maxv = val;
+ timeout_t maxv = timeout;
/* move bins over */
shift = (u32)(now - at->at_binstart) / binlimit;
for(i = AT_BINS - 1; i >= 0; i--) {
if (i >= shift) {
at->at_hist[i] = at->at_hist[i - shift];
- maxv = max(maxv, at->at_hist[i]);
+ maxv = max_t(timeout_t, maxv, at->at_hist[i]);
} else {
at->at_hist[i] = 0;
}
}
- at->at_hist[0] = val;
- at->at_current = maxv;
+ at->at_hist[0] = timeout;
+ at->at_current_timeout = maxv;
at->at_binstart += shift * binlimit;
}
- if (at->at_current > at->at_worst_ever) {
- at->at_worst_ever = at->at_current;
- at->at_worst_time = now;
- }
+ if (at->at_current_timeout > at->at_worst_timeout_ever) {
+ at->at_worst_timeout_ever = at->at_current_timeout;
+ at->at_worst_timestamp = now;
+ }
- if (at->at_flags & AT_FLG_NOHIST)
+ if (at->at_flags & AT_FLG_NOHIST)
/* Only keep last reported val; keeping the rest of the history
- for proc only */
- at->at_current = val;
+ * for debugfs only
+ */
+ at->at_current_timeout = timeout;
if (at_max > 0)
- at->at_current = min(at->at_current, at_max);
- at->at_current = max(at->at_current, at_min);
-
- if (at->at_current != old)
- CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d "
- "(val=%u) hist %u %u %u %u\n", at,
- old, at->at_current, at->at_current - old, val,
+ at->at_current_timeout = min_t(timeout_t,
+ at->at_current_timeout, at_max);
+ at->at_current_timeout = max_t(timeout_t, at->at_current_timeout,
+ at_min);
+ if (at->at_current_timeout != old_timeout)
+ CDEBUG(D_OTHER,
+ "AT %p change: old=%u new=%u delta=%d (val=%d) hist %u %u %u %u\n",
+ at, old_timeout, at->at_current_timeout,
+ at->at_current_timeout - old_timeout, timeout,
at->at_hist[0], at->at_hist[1], at->at_hist[2],
at->at_hist[3]);
- /* if we changed, report the old value */
- old = (at->at_current != old) ? old : 0;
+ /* if we changed, report the old timeout value */
+ old_timeout = (at->at_current_timeout != old_timeout) ? old_timeout : 0;
spin_unlock(&at->at_lock);
- return old;
+ return old_timeout;
}
/* Find the imp_at index for a given portal; assign if space available */