imp->imp_obd->obd_name,
libcfs_nid2str(conn->oic_conn->c_peer.nid),
conn->oic_last_attempt);
- /* Don't thrash connections */
- if (cfs_time_before_64(cfs_time_current_64(),
- conn->oic_last_attempt +
- cfs_time_seconds(CONNECTION_SWITCH_MIN))) {
- continue;
- }
/* If we have not tried this connection since
the last successful attempt, go with this one */
static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
{
#ifdef __KERNEL__
- struct obd_import_conn *imp_conn;
-#endif
- int wake_pinger = 0;
-
- ENTRY;
-
- cfs_spin_lock(&imp->imp_lock);
- if (cfs_list_empty(&imp->imp_conn_list))
- GOTO(unlock, 0);
-
-#ifdef __KERNEL__
- imp_conn = cfs_list_entry(imp->imp_conn_list.prev,
- struct obd_import_conn,
- oic_item);
-
- /* XXX: When the failover node is the primary node, it is possible
- * to have two identical connections in imp_conn_list. We must
- * compare not conn's pointers but NIDs, otherwise we can defeat
- * connection throttling. (See bug 14774.) */
- if (imp->imp_conn_current->oic_conn->c_peer.nid !=
- imp_conn->oic_conn->c_peer.nid) {
- ptlrpc_ping_import_soon(imp);
- wake_pinger = 1;
- }
+ /* kernel build: nothing to do here, the pinger takes care of issuing the next reconnect request */
+ return;
#else
/* liblustre has no pinger thread, so we wakeup pinger anyway */
- wake_pinger = 1;
+ ptlrpc_pinger_wake_up();
#endif
-
- unlock:
- cfs_spin_unlock(&imp->imp_lock);
-
- if (wake_pinger)
- ptlrpc_pinger_wake_up();
-
- EXIT;
}
static int ptlrpc_busy_reconnect(int rc)
at_max_set 0 mds client
fi
- mkdir -p $MOUNT/testdir
- touch $MOUNT/testdir/test
+ mkdir -p $MOUNT/$tdir
log "Injecting EBUSY on MDS"
# Setting OBD_FAIL_MDS_RESEND=0x136
do_facet mds "$LCTL set_param fail_loc=0x80000136" || return 2
- log "Stat on a test file"
- stat $MOUNT/testdir/test
+ $LCTL set_param mdc.${FSNAME}*.stats=clear
+
+ log "Creating a test file and stat it"
+ touch $MOUNT/$tdir/$tfile
+ stat $MOUNT/$tdir/$tfile
log "Stop injecting EBUSY on MDS"
do_facet mds "$LCTL set_param fail_loc=0" || return 3
- rm -f $MOUNT/testdir/test
+ rm -f $MOUNT/$tdir/$tfile
log "done"
# restore adaptive timeout
$LCTL dk $TMP/lustre-log-$TESTNAME.log
+ CONNCNT=`$LCTL get_param mdc.${FSNAME}*.stats | awk '/mds_connect/{print $2}'`
+
# retrieve from the log if the client has ever tried to
# contact the fake server after the loss of connection
FAILCONN=`awk "BEGIN {ret = 0;}
log "ERROR: The client tried to reconnect to the failover server while the primary was busy" && \
return 5
+ # When OBD_FAIL_MDS_RESEND is hit, we sleep for 2 * obd_timeout.
+ # Reconnects are supposed to be rate-limited to one every 5s, so more
+ # than 2 * TIMEOUT / 5 + 1 connect attempts is treated as an error.
+ [ $CONNCNT -gt $((2 * $TIMEOUT / 5 + 1)) ] && \
+ log "ERROR: Too many reconnects $CONNCNT" && \
+ return 6
+
cleanup
# remove nid settings
writeconf