else
strncpy(nidstr, "<none>", sizeof(nidstr));
seq_printf(m, " ]\n"
+ " nids_stats:");
+ list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+ libcfs_nidstr_r(&conn->oic_conn->c_peer.nid,
+ nidstr, sizeof(nidstr));
+ seq_printf(m, "\n \"%s\": { connects: %u, replied: %u,"
+ " uptodate: %s, sec_ago: ",
+ nidstr, conn->oic_attempts, conn->oic_replied,
+ conn->oic_uptodate ? "true" : "false");
+ if (conn->oic_last_attempt)
+ seq_printf(m, "%lld }", ktime_get_seconds() -
+ conn->oic_last_attempt);
+ else
+ seq_puts(m, "never }");
+ }
+ if (imp->imp_connection)
+ libcfs_nidstr_r(&imp->imp_connection->c_peer.nid,
+ nidstr, sizeof(nidstr));
+ else
+ strncpy(nidstr, "<none>", sizeof(nidstr));
+ seq_printf(m, "\n"
" current_connection: \"%s\"\n"
" connection_attempts: %u\n"
" generation: %u\n"
*/
static int import_select_connection(struct obd_import *imp)
{
- struct obd_import_conn *imp_conn = NULL, *conn;
+ struct obd_import_conn *imp_conn = NULL, *conn, *lru_conn = NULL;
struct obd_export *dlmexp;
char *target_start;
- int target_len, tried_all = 1;
+ int target_len;
+ bool tried_all = true;
int rc = 0;
ENTRY;
GOTO(out_unlock, rc);
}
+ /* if forced, simply choose the current one */
+ if (imp->imp_force_reconnect) {
+ LASSERT(imp->imp_conn_current);
+ imp_conn = imp->imp_conn_current;
+ tried_all = false;
+ goto connect;
+ }
+
list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
CDEBUG(D_HA, "%s: connect to NID %s last attempt %lld\n",
imp->imp_obd->obd_name,
libcfs_nidstr(&conn->oic_conn->c_peer.nid),
conn->oic_last_attempt);
+ conn->oic_uptodate =
+ LNetPeerDiscovered(&conn->oic_conn->c_peer.nid);
+ /* track least recently used conn for fallback */
+ if (!lru_conn ||
+ lru_conn->oic_last_attempt > conn->oic_last_attempt)
+ lru_conn = conn;
+
/* If we have not tried this connection since
- * the last successful attempt, go with this one
+ * the last successful attempt or ever (0 value)
*/
- if ((conn->oic_last_attempt == 0) ||
- conn->oic_last_attempt <= imp->imp_last_success_conn) {
- imp_conn = conn;
- tried_all = 0;
- break;
+ if (conn->oic_last_attempt <= imp->imp_last_success_conn) {
+ tried_all = false;
+ if (conn->oic_uptodate) {
+ imp_conn = conn;
+ break;
+ }
+ CDEBUG(D_HA, "%s: skip NID %s as not ready\n",
+ imp->imp_obd->obd_name,
+ libcfs_nidstr(&conn->oic_conn->c_peer.nid));
}
-
- /* If all of the connections have already been tried
- * since the last successful connection; just choose the
- * least recently used
- */
- if (!imp_conn)
- imp_conn = conn;
- else if (imp_conn->oic_last_attempt > conn->oic_last_attempt)
- imp_conn = conn;
}
- /* if not found, simply choose the current one */
- if (!imp_conn || imp->imp_force_reconnect) {
- LASSERT(imp->imp_conn_current);
- imp_conn = imp->imp_conn_current;
- tried_all = 0;
- }
+ /* no ready connections or all are tried in this round */
+ if (!imp_conn)
+ imp_conn = lru_conn;
+
LASSERT(imp_conn->oic_conn);
- /* If we've tried everything, and we're back to the beginning of the
- * list, increase our timeout and try again. It will be reset when
- * we do finally connect. (FIXME: really we should wait for all network
- * state associated with the last connection attempt to drain before
- * trying to reconnect on it.)
- */
- if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+ if (!tried_all) {
struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
timeout_t timeout = obd_at_get(imp->imp_obd, at);
+ /* First round: some NID has not been tried since the last
+  * successful connect. If the net latency estimate has grown
+  * above the minimum, drop it back to CONNECTION_SWITCH_MIN so
+  * that an unresponsive NID is given up on quickly.
+  */
+ if (timeout > CONNECTION_SWITCH_MIN)
+ at_reset(at, CONNECTION_SWITCH_MIN);
+ } else if (imp->imp_conn_list.next == &imp_conn->oic_item) {
+ struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+ timeout_t timeout = obd_at_get(imp->imp_obd, at);
+
+ /* If we've tried everything, and we're back to the beginning
+ * of the list, increase timeout and try again. It will be
+ * reset when we do finally connect.
+ * FIXME: really we should wait for all network state
+ * associated with the last connection attempt to drain before
+ * trying to reconnect on it.
+ */
if (timeout < CONNECTION_SWITCH_MAX) {
obd_at_measure(imp->imp_obd, at,
timeout + CONNECTION_SWITCH_INC);
imp->imp_obd->obd_name, timeout);
}
+connect:
imp_conn->oic_last_attempt = ktime_get_seconds();
+ imp_conn->oic_attempts++;
/* switch connection, don't mind if it's same as the current one */
ptlrpc_connection_put(imp->imp_connection);
spin_unlock(&imp->imp_lock);
LASSERT(imp->imp_conn_current);
+ imp->imp_conn_current->oic_replied++;
msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
}
run_test 152 "seq allocation error in OSP"
+test_153a() { # mount a client whose NID list starts with fake NIDs, then check the mount
+ reformat_and_config
+
+ start_mds || error "MDS start failed"
+ start_ost || error "OST start failed"
+
+ local nid=$($LCTL list_nids | grep ${NETTYPE} | head -n1) # first local NID matching NETTYPE
+ local net=${nid#*@}
+ local MGS_NID=$(do_facet mgs $LCTL list_nids | head -1) # first NID reported by the mgs facet
+ local OST1_NID=$(do_facet ost1 $LCTL list_nids | head -1) # first NID reported by the ost1 facet
+ local FAKE_PNID="192.168.252.112@${net}" # fake primary NID on the local NID's network
+ local FAKE_NIDS="${FAKE_PNID},${FAKE_PNID}2" # same address again with "2" appended to the net name
+ local FAKE_FAILOVER="10.252.252.113@${net},10.252.252.113@${net}2" # second fake group
+ local NIDS_AND_FAILOVER="$FAKE_NIDS:$FAKE_FAILOVER:$OST1_NID:$MGS_NID" # fake groups come first, real NIDs last
+ local period=0 # seconds waited so far
+ local pid # PID of the background mount
+ local rc # NOTE(review): never assigned or read in this function
+
+ mount -t lustre $NIDS_AND_FAILOVER:/lustre $MOUNT & # run in background so the wait below is bounded
+ pid=$!
+ while (( period < 30 )); do # poll in 5 s steps, at most 30 s
+ [[ -n "$(ps -p $pid -o pid=)" ]] || break # stop waiting once the mount process has exited
+ echo "waiting for mount ..."
+ sleep 5
+ period=$((period + 5))
+ done
+ $LCTL get_param mgc.MGC${FAKE_PNID}.import | grep "uptodate:" # prints matching lines; result is not checked here
+ check_mount || error "check_mount failed"
+ umount $MOUNT
+ cleanup || error "cleanup failed with rc $?"
+}
+run_test 153a "bypass invalid NIDs quickly"
+
#
# (This was sanity/802a)
#