imp->imp_obd->obd_name,
libcfs_nid2str(conn->oic_conn->c_peer.nid),
conn->oic_last_attempt);
+ /* Don't thrash connections */
+ if (cfs_time_before_64(cfs_time_current_64(),
+ conn->oic_last_attempt +
+ cfs_time_seconds(CONNECTION_SWITCH_MIN))) {
+ continue;
+ }
/* If we have not tried this connection since
the last successful attempt, go with this one */
static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
{
#ifdef __KERNEL__
- /* the pinger takes care of issuing the next reconnect request */
- return;
+ struct obd_import_conn *imp_conn; /* entry taken from imp_conn_list.prev below */
+#endif
+ int wake_pinger = 0; /* set to 1 when ptlrpc_pinger_wake_up() must run after unlocking */
+
+ ENTRY;
+
+ cfs_spin_lock(&imp->imp_lock); /* held while the connection list and current connection are read */
+ if (cfs_list_empty(&imp->imp_conn_list))
+ GOTO(unlock, 0); /* no connections: leave wake_pinger at 0 and just unlock */
+
+#ifdef __KERNEL__
+ imp_conn = cfs_list_entry(imp->imp_conn_list.prev,
+ struct obd_import_conn,
+ oic_item);
+
+ /* XXX: When the failover node is the primary node, it is possible
+ * to have two identical connections in imp_conn_list. We must
+ * compare not conn's pointers but NIDs, otherwise we can defeat
+ * connection throttling. (See bug 14774.)
+ *
+ * Only when the NID of the current connection differs from the NID
+ * of the entry at imp_conn_list.prev is ptlrpc_ping_import_soon()
+ * called and the pinger wake-up flagged; otherwise nothing is done
+ * here.
+ * NOTE(review): imp_conn_current is dereferenced without a NULL
+ * check - presumably it is always set once the list is non-empty;
+ * confirm against the callers. */
+ if (imp->imp_conn_current->oic_conn->c_peer.nid !=
+ imp_conn->oic_conn->c_peer.nid) {
+ ptlrpc_ping_import_soon(imp);
+ wake_pinger = 1;
+ }
#else
/* liblustre has no pinger thread, so we wakeup pinger anyway */
- ptlrpc_pinger_wake_up();
+ wake_pinger = 1;
#endif
+
+ unlock: /* also reached directly from the empty-list check above */
+ cfs_spin_unlock(&imp->imp_lock);
+
+ if (wake_pinger) /* the wake-up itself is issued only after imp_lock is released */
+ ptlrpc_pinger_wake_up();
+
+ EXIT;
}
static int ptlrpc_busy_reconnect(int rc)
at_max_set 0 mds client
fi
- mkdir -p $MOUNT/$tdir
+ mkdir -p $MOUNT/testdir
+ touch $MOUNT/testdir/test
log "Injecting EBUSY on MDS"
# Setting OBD_FAIL_MDS_RESEND=0x136
do_facet mds "$LCTL set_param fail_loc=0x80000136" || return 2
- $LCTL set_param mdc.${FSNAME}*.stats=clear
-
- log "Creating a test file and stat it"
- touch $MOUNT/$tdir/$tfile
- stat $MOUNT/$tdir/$tfile
+ log "Stat on a test file"
+ stat $MOUNT/testdir/test
log "Stop injecting EBUSY on MDS"
do_facet mds "$LCTL set_param fail_loc=0" || return 3
- rm -f $MOUNT/$tdir/$tfile
+ rm -f $MOUNT/testdir/test
log "done"
# restore adaptive timeout
$LCTL dk $TMP/lustre-log-$TESTNAME.log
- CONNCNT=`$LCTL get_param mdc.${FSNAME}*.stats | awk '/mds_connect/{print $2}'`
-
# retrieve from the log if the client has ever tried to
# contact the fake server after the loss of connection
FAILCONN=`awk "BEGIN {ret = 0;}
log "ERROR: The client tried to reconnect to the failover server while the primary was busy" && \
return 5
- # When OBD_FAIL_MDS_RESEND is hit, we sleep for 2 * obd_timeout
- # Reconnects are supposed to be rate limited to one every 5s
- [ $CONNCNT -gt $((2 * $TIMEOUT / 5 + 1)) ] && \
- log "ERROR: Too many reconnects $CONNCNT" && \
- return 6
-
cleanup
# remove nid settings
writeconf