From fbe66c3a22bfa136c077cbd46c4186ecbf4946d2 Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Tue, 31 Aug 2010 01:17:09 +0400 Subject: [PATCH 1/1] Revert "b=22423 Reconnects are not throttled" This reverts commit 52a7f6e56654faa1fa2ed6d9a745cc018de6ff86. --- lustre/ptlrpc/import.c | 42 +++++++++++++++++++++++++++++++++++++++--- lustre/tests/conf-sanity.sh | 20 +++++--------------- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 272d86c..1a461f1 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -476,6 +476,12 @@ static int import_select_connection(struct obd_import *imp) imp->imp_obd->obd_name, libcfs_nid2str(conn->oic_conn->c_peer.nid), conn->oic_last_attempt); + /* Don't thrash connections */ + if (cfs_time_before_64(cfs_time_current_64(), + conn->oic_last_attempt + + cfs_time_seconds(CONNECTION_SWITCH_MIN))) { + continue; + } /* If we have not tried this connection since the last successful attempt, go with this one */ @@ -726,12 +732,42 @@ EXPORT_SYMBOL(ptlrpc_connect_import); static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) { #ifdef __KERNEL__ - /* the pinger takes care of issuing the next reconnect request */ - return; + struct obd_import_conn *imp_conn; +#endif + int wake_pinger = 0; + + ENTRY; + + cfs_spin_lock(&imp->imp_lock); + if (cfs_list_empty(&imp->imp_conn_list)) + GOTO(unlock, 0); + +#ifdef __KERNEL__ + imp_conn = cfs_list_entry(imp->imp_conn_list.prev, + struct obd_import_conn, + oic_item); + + /* XXX: When the failover node is the primary node, it is possible + * to have two identical connections in imp_conn_list. We must + * compare not conn's pointers but NIDs, otherwise we can defeat + * connection throttling. (See bug 14774.) */ + if (imp->imp_conn_current->oic_conn->c_peer.nid != + imp_conn->oic_conn->c_peer.nid) { + ptlrpc_ping_import_soon(imp); + wake_pinger = 1; + } #else /* liblustre has no pinger thread, so we wakeup pinger anyway */ - ptlrpc_pinger_wake_up(); + wake_pinger = 1; #endif + + unlock: + cfs_spin_unlock(&imp->imp_lock); + + if (wake_pinger) + ptlrpc_pinger_wake_up(); + + EXIT; } static int ptlrpc_busy_reconnect(int rc) diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index f4d8546..055302d 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -1374,21 +1374,19 @@ test_35b() { # bug 18674 at_max_set 0 mds client fi - mkdir -p $MOUNT/$tdir + mkdir -p $MOUNT/testdir + touch $MOUNT/testdir/test log "Injecting EBUSY on MDS" # Setting OBD_FAIL_MDS_RESEND=0x136 do_facet mds "$LCTL set_param fail_loc=0x80000136" || return 2 - $LCTL set_param mdc.${FSNAME}*.stats=clear - - log "Creating a test file and stat it" - touch $MOUNT/$tdir/$tfile - stat $MOUNT/$tdir/$tfile + log "Stat on a test file" + stat $MOUNT/testdir/test log "Stop injecting EBUSY on MDS" do_facet mds "$LCTL set_param fail_loc=0" || return 3 - rm -f $MOUNT/$tdir/$tfile + rm -f $MOUNT/testdir/test log "done" # restore adaptive timeout @@ -1396,8 +1394,6 @@ test_35b() { # bug 18674 $LCTL dk $TMP/lustre-log-$TESTNAME.log - CONNCNT=`$LCTL get_param mdc.${FSNAME}*.stats | awk '/mds_connect/{print $2}'` - # retrieve from the log if the client has ever tried to # contact the fake server after the loss of connection FAILCONN=`awk "BEGIN {ret = 0;} @@ -1417,12 +1413,6 @@ test_35b() { # bug 18674 log "ERROR: The client tried to reconnect to the failover server while the primary was busy" && \ return 5 - # When OBD_FAIL_MDS_RESEND is hit, we sleep for 2 * obd_timeout - # Reconnects are supposed to be rate limited to one every 5s - [ $CONNCNT -gt $((2 * $TIMEOUT / 5 + 1)) ] && \ - log "ERROR: Too many reconnects $CONNCNT" && \ - return 6 - cleanup # remove nid settings writeconf -- 1.8.3.1