Whamcloud - gitweb
Branch b1_6
authorjohann <johann>
Thu, 30 Aug 2007 16:16:35 +0000 (16:16 +0000)
committerjohann <johann>
Thu, 30 Aug 2007 16:16:35 +0000 (16:16 +0000)
b=12459
i=adilger,tianzy
i=scjody

Severity   : normal
Bugzilla   : 12459
Description: Client eviction due to failover config
Details    : after a connection loss, the lustre client should attempt to
     reconnect to the last active server first before trying the
     other potential connections.

lustre/ChangeLog
lustre/ptlrpc/import.c
lustre/tests/conf-sanity.sh

index 9ac9f75..d7eca9d 100644 (file)
@@ -128,6 +128,13 @@ Description: testing performance impact of enabling checksumming
 Details    : enable checksum by default, allow --disable-checksum 
              configure option and "-o nochecksum" mount option 
 
+Severity   : normal
+Bugzilla   : 12459
+Description: Client eviction due to failover config
+Details    : after a connection loss, the lustre client should attempt to
+            reconnect to the last active server first before trying the
+            other potential connections.
+
 --------------------------------------------------------------------------------
 
 2007-08-27         Cluster File Systems, Inc. <info@clusterfs.com>
index dc22564..66adf40 100644 (file)
@@ -279,7 +279,7 @@ static int import_select_connection(struct obd_import *imp)
                                        cfs_time_current_64())) {
                         /* If we have never tried this connection since the
                            last successful attempt, go with this one */
-                        if (cfs_time_before_64(conn->oic_last_attempt,
+                        if (cfs_time_beforeq_64(conn->oic_last_attempt,
                                                imp->imp_last_success_conn)) {
                                 imp_conn = conn;
                                 break;
index 02afc82..f053663 100644 (file)
@@ -1254,5 +1254,51 @@ test_34c() {
 }
 run_test 34c "force umount with failed mds should be normal"
 
+test_35() { # bug 12459
+       setup
+
+       debugsave
+       sysctl -w lnet.debug="ha"
+
+       log "Set up a fake failnode for the MDS"
+       FAKENID="127.0.0.2"
+       $LCTL conf_param ${FSNAME}-MDT0000.failover.node=$FAKENID || return 4
+
+       log "Wait for RECONNECT_INTERVAL seconds (10s)"
+       sleep 10
+
+       MSG="conf-sanity.sh test_33 `date +%F%kh%Mm%Ss`"
+       $LCTL clear
+       log "$MSG"
+       log "Stopping the MDT:"
+       stop_mds || return 5
+
+       df $MOUNT > /dev/null 2>&1 &
+       DFPID=$!
+       log "Restarting the MDT:"
+       start_mds || return 6
+       log "Wait for df ($DFPID) ... "
+       wait $DFPID
+       log "done"
+       debugrestore
+
+       # retrieve from the log the first server that the client tried to
+       # contact after the connection loss
+       $LCTL dk $TMP/lustre-log-$TESTNAME.log
+       NEXTCONN=`awk "/${MSG}/ {start = 1;}
+                      /import_select_connection.*${FSNAME}-MDT0000-mdc.* using connection/ {
+                               if (start) {
+                                       if (\\\$NF ~ /$FAKENID/)
+                                               print \\\$NF;
+                                       else
+                                               print 0;
+                                       exit;
+                               }
+                      }" $TMP/lustre-log-$TESTNAME.log`
+       [ "$NEXTCONN" != "0" ] && log "The client didn't try to reconnect to the last active server (tried ${NEXTCONN} instead)" && return 7
+       cleanup
+}
+run_test 35 "Reconnect to the last active server first"
+
 equals_msg "Done"
 echo "$0: completed"