From: Mikhail Pershin Date: Thu, 19 Aug 2010 07:02:05 +0000 (+0400) Subject: b=22423 Reconnects are not throttled X-Git-Tag: 2.0.51.0~72 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=52a7f6e56654faa1fa2ed6d9a745cc018de6ff86 b=22423 Reconnects are not throttled Rely on pings to issue reconnects. Conf-sanity.sh test_35 modified. i=nathan.ruthman i=dmitry.zoguine --- diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 1a461f1..272d86c 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -476,12 +476,6 @@ static int import_select_connection(struct obd_import *imp) imp->imp_obd->obd_name, libcfs_nid2str(conn->oic_conn->c_peer.nid), conn->oic_last_attempt); - /* Don't thrash connections */ - if (cfs_time_before_64(cfs_time_current_64(), - conn->oic_last_attempt + - cfs_time_seconds(CONNECTION_SWITCH_MIN))) { - continue; - } /* If we have not tried this connection since the last successful attempt, go with this one */ @@ -732,42 +726,12 @@ EXPORT_SYMBOL(ptlrpc_connect_import); static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) { #ifdef __KERNEL__ - struct obd_import_conn *imp_conn; -#endif - int wake_pinger = 0; - - ENTRY; - - cfs_spin_lock(&imp->imp_lock); - if (cfs_list_empty(&imp->imp_conn_list)) - GOTO(unlock, 0); - -#ifdef __KERNEL__ - imp_conn = cfs_list_entry(imp->imp_conn_list.prev, - struct obd_import_conn, - oic_item); - - /* XXX: When the failover node is the primary node, it is possible - * to have two identical connections in imp_conn_list. We must - * compare not conn's pointers but NIDs, otherwise we can defeat - * connection throttling. (See bug 14774.) */ - if (imp->imp_conn_current->oic_conn->c_peer.nid != - imp_conn->oic_conn->c_peer.nid) { - ptlrpc_ping_import_soon(imp); - wake_pinger = 1; - } + /* the pinger takes care of issuing the next reconnect request */ + return; #else /* liblustre has no pinger thread, so we wakeup pinger anyway */ - wake_pinger = 1; + ptlrpc_pinger_wake_up(); #endif - - unlock: - cfs_spin_unlock(&imp->imp_lock); - - if (wake_pinger) - ptlrpc_pinger_wake_up(); - - EXIT; } static int ptlrpc_busy_reconnect(int rc) diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 092ffbd..f5aea1a 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -1366,19 +1366,21 @@ test_35b() { # bug 18674 at_max_set 0 mds client fi - mkdir -p $MOUNT/testdir - touch $MOUNT/testdir/test + mkdir -p $MOUNT/$tdir log "Injecting EBUSY on MDS" # Setting OBD_FAIL_MDS_RESEND=0x136 do_facet mds "$LCTL set_param fail_loc=0x80000136" || return 2 - log "Stat on a test file" - stat $MOUNT/testdir/test + $LCTL set_param mdc.${FSNAME}*.stats=clear + + log "Creating a test file and stat it" + touch $MOUNT/$tdir/$tfile + stat $MOUNT/$tdir/$tfile log "Stop injecting EBUSY on MDS" do_facet mds "$LCTL set_param fail_loc=0" || return 3 - rm -f $MOUNT/testdir/test + rm -f $MOUNT/$tdir/$tfile log "done" # restore adaptive timeout @@ -1386,6 +1388,8 @@ test_35b() { # bug 18674 $LCTL dk $TMP/lustre-log-$TESTNAME.log + CONNCNT=`$LCTL get_param mdc.${FSNAME}*.stats | awk '/mds_connect/{print $2}'` + # retrieve from the log if the client has ever tried to # contact the fake server after the loss of connection FAILCONN=`awk "BEGIN {ret = 0;} @@ -1405,6 +1409,12 @@ test_35b() { # bug 18674 log "ERROR: The client tried to reconnect to the failover server while the primary was busy" && \ return 5 + # When OBD_FAIL_MDS_RESEND is hit, we sleep for 2 * obd_timeout + # Reconnects are supposed to be rate limited to one every 5s + [ $CONNCNT -gt $((2 * $TIMEOUT / 5 + 1)) ] && \ + log "ERROR: Too many reconnects $CONNCNT" && \ + return 6 + cleanup # remove nid settings writeconf