From 86b2211e55dcc509da85b21ece8830e2a9b70db1 Mon Sep 17 00:00:00 2001 From: Lai Siyao Date: Tue, 14 Jun 2011 19:52:55 -0700 Subject: [PATCH] LU-290 Reconnects are not throttled bz22423 Client should not try to reconnect to the same nid in a busy loop, but instead rely on pinger to issue reconnect. Signed-off-by: Lai Siyao Change-Id: I878093030d62173071192ca816e037464cc9248d Reviewed-on: http://review.whamcloud.com/944 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Johann Lombardi Reviewed-by: hongchao.zhang Tested-by: hongchao.zhang Reviewed-by: Oleg Drokin --- lustre/ptlrpc/import.c | 42 +++--------------------------------------- lustre/tests/conf-sanity.sh | 23 +++++++++++++++++------ 2 files changed, 20 insertions(+), 45 deletions(-) diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index ef403ce..0934764 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -479,12 +479,6 @@ static int import_select_connection(struct obd_import *imp) imp->imp_obd->obd_name, libcfs_nid2str(conn->oic_conn->c_peer.nid), conn->oic_last_attempt); - /* Don't thrash connections */ - if (cfs_time_before_64(cfs_time_current_64(), - conn->oic_last_attempt + - cfs_time_seconds(CONNECTION_SWITCH_MIN))) { - continue; - } /* If we have not tried this connection since the last successful attempt, go with this one */ @@ -735,42 +729,12 @@ EXPORT_SYMBOL(ptlrpc_connect_import); static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) { #ifdef __KERNEL__ - struct obd_import_conn *imp_conn; -#endif - int wake_pinger = 0; - - ENTRY; - - cfs_spin_lock(&imp->imp_lock); - if (cfs_list_empty(&imp->imp_conn_list)) - GOTO(unlock, 0); - -#ifdef __KERNEL__ - imp_conn = cfs_list_entry(imp->imp_conn_list.prev, - struct obd_import_conn, - oic_item); - - /* XXX: When the failover node is the primary node, it is possible - * to have two identical connections in imp_conn_list. We must - * compare not conn's pointers but NIDs, otherwise we can defeat - * connection throttling. (See bug 14774.) */ - if (imp->imp_conn_current->oic_conn->c_peer.nid != - imp_conn->oic_conn->c_peer.nid) { - ptlrpc_ping_import_soon(imp); - wake_pinger = 1; - } + /* the pinger takes care of issuing the next reconnect request */ + return; #else /* liblustre has no pinger thread, so we wakeup pinger anyway */ - wake_pinger = 1; + ptlrpc_pinger_wake_up(); #endif - - unlock: - cfs_spin_unlock(&imp->imp_lock); - - if (wake_pinger) - ptlrpc_pinger_wake_up(); - - EXIT; } static int ptlrpc_busy_reconnect(int rc) diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 990b215..c306f4a 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -1493,19 +1493,21 @@ test_35b() { # bug 18674 at_max_set 0 mds client fi - mkdir -p $MOUNT/testdir - touch $MOUNT/testdir/test + mkdir -p $MOUNT/$tdir log "Injecting EBUSY on MDS" # Setting OBD_FAIL_MDS_RESEND=0x136 do_facet $SINGLEMDS "$LCTL set_param fail_loc=0x80000136" || return 2 - log "Stat on a test file" - stat $MOUNT/testdir/test + $LCTL set_param mdc.${FSNAME}*.stats=clear + + log "Creating a test file and stat it" + touch $MOUNT/$tdir/$tfile + stat $MOUNT/$tdir/$tfile log "Stop injecting EBUSY on MDS" do_facet $SINGLEMDS "$LCTL set_param fail_loc=0" || return 3 - rm -f $MOUNT/testdir/test + rm -f $MOUNT/$tdir/$tfile log "done" # restore adaptive timeout @@ -1513,6 +1515,8 @@ test_35b() { # bug 18674 $LCTL dk $TMP/lustre-log-$TESTNAME.log + CONNCNT=`$LCTL get_param mdc.${FSNAME}*.stats | awk '/mds_connect/{print $2}'` + # retrieve from the log if the client has ever tried to # contact the fake server after the loss of connection FAILCONN=`awk "BEGIN {ret = 0;} @@ -1532,7 +1536,14 @@ test_35b() { # bug 18674 log "ERROR: The client tried to reconnect to the failover server while the primary was busy" && \ return 5 - cleanup + # LU-290 + # When OBD_FAIL_MDS_RESEND is hit, we sleep for 2 * obd_timeout + # Reconnects are supposed to be rate limited to one every 5s + [ $CONNCNT -gt $((2 * $TIMEOUT / 5 + 1)) ] && \ + log "ERROR: Too many reconnects $CONNCNT" && \ + return 6 + + cleanup # remove nid settings writeconf } -- 1.8.3.1