Whamcloud - gitweb
Branch HEAD
author johann <johann>
Fri, 31 Aug 2007 12:48:09 +0000 (12:48 +0000)
committer johann <johann>
Fri, 31 Aug 2007 12:48:09 +0000 (12:48 +0000)
b=12459
i=adilger,tianzy
i=scjody

Severity   : normal
Bugzilla   : 12459
Description: Client eviction due to failover config
Details    : after a connection loss, the lustre client should attempt to
     reconnect to the last active server first before trying the
     other potential connections.

lustre/ChangeLog
lustre/ptlrpc/import.c
lustre/tests/conf-sanity.sh

index 59e69a5..bacd497 100644 (file)
@@ -201,6 +201,13 @@ Bugzilla   : 11802
 Description: lustre support for RHEL5
 Details    : Add support for RHEL5.
 
+Severity   : normal
+Bugzilla   : 12459
+Description: Client eviction due to failover config
+Details    : after a connection loss, the lustre client should attempt to
+            reconnect to the last active server first before trying the
+            other potential connections.
+
 --------------------------------------------------------------------------------
 
 2007-08-10         Cluster File Systems, Inc. <info@clusterfs.com>
index 0778530..0c7eff2 100644 (file)
@@ -290,7 +290,7 @@ static int import_select_connection(struct obd_import *imp)
                                        cfs_time_current_64())) {
                         /* If we have never tried this connection since the
                            the last successful attempt, go with this one */
-                        if (cfs_time_before_64(conn->oic_last_attempt,
+                        if (cfs_time_beforeq_64(conn->oic_last_attempt,
                                                imp->imp_last_success_conn)) {
                                 imp_conn = conn;
                                 break;
index 46f36ed..a61e502 100644 (file)
@@ -1193,6 +1193,52 @@ test_33() { # bug 12333
 }
 run_test 33 "Mount ost with a large index number"
 
+test_35() { # bug 12459: client must retry the last active server first
+       setup
+       # Save the current lnet.debug value (restored below) and switch it to "ha"
+       DBG_SAVE="`sysctl -n lnet.debug`"
+       sysctl -w lnet.debug="ha"
+
+       log "Set up a fake failnode for the MDS"
+       FAKENID="127.0.0.2"
+       $LCTL conf_param ${FSNAME}-MDT0000.failover.node=$FAKENID || return 4
+
+       log "Wait for RECONNECT_INTERVAL seconds (10s)"
+       sleep 10
+       # Timestamped marker; the awk script below ignores everything logged before it
+       MSG="conf-sanity.sh test_35 `date +%F%kh%Mm%Ss`"
+       $LCTL clear
+       log "$MSG"
+       log "Stopping the MDT:"
+       stop_mds || return 5
+       # Run df in the background while the MDT is down; it is waited for after restart
+       df $MOUNT > /dev/null 2>&1 &
+       DFPID=$!
+       log "Restarting the MDT:"
+       start_mds || return 6
+       log "Wait for df ($DFPID) ... "
+       wait $DFPID
+       log "done"
+       sysctl -w lnet.debug="$DBG_SAVE"
+
+       # From the dumped log, find the first server the client tried after the
+       # connection loss: awk prints 0 unless that server was $FAKENID
+       $LCTL dk $TMP/lustre-log-$TESTNAME.log
+       NEXTCONN=`awk "/${MSG}/ {start = 1;}
+                      /import_select_connection.*${FSNAME}-MDT0000-mdc.* using connection/ {
+                               if (start) {
+                                       if (\\\$NF ~ /$FAKENID/)
+                                               print \\\$NF;
+                                       else
+                                               print 0;
+                                       exit;
+                               }
+                      }" $TMP/lustre-log-$TESTNAME.log`
+       [ "$NEXTCONN" != "0" ] && log "The client didn't try to reconnect to the last active server (tried ${NEXTCONN} instead)" && return 7
+       cleanup
+}
+run_test 35 "Reconnect to the last active server first"
+
 umount_client $MOUNT   
 cleanup_nocli
 cleanup_krb5_env