Whamcloud - gitweb
LU-290 Reconnects are not throttled
authorLai Siyao <laisiyao@whamcloud.com>
Wed, 15 Jun 2011 02:52:55 +0000 (19:52 -0700)
committerOleg Drokin <green@whamcloud.com>
Thu, 4 Aug 2011 06:31:51 +0000 (02:31 -0400)
bz22423
Client should not try to reconnect to the same nid in a busy loop,
but instead rely on pinger to issue reconnect.

Signed-off-by: Lai Siyao <laisiyao@whamcloud.com>
Change-Id: I878093030d62173071192ca816e037464cc9248d
Reviewed-on: http://review.whamcloud.com/944
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Johann Lombardi <johann@whamcloud.com>
Reviewed-by: hongchao.zhang <hongchao.zhang@whamcloud.com>
Tested-by: hongchao.zhang <hongchao.zhang@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ptlrpc/import.c
lustre/tests/conf-sanity.sh

index ef403ce..0934764 100644 (file)
@@ -479,12 +479,6 @@ static int import_select_connection(struct obd_import *imp)
                        imp->imp_obd->obd_name,
                        libcfs_nid2str(conn->oic_conn->c_peer.nid),
                        conn->oic_last_attempt);
-                /* Don't thrash connections */
-                if (cfs_time_before_64(cfs_time_current_64(),
-                                     conn->oic_last_attempt +
-                                     cfs_time_seconds(CONNECTION_SWITCH_MIN))) {
-                        continue;
-                }
 
                 /* If we have not tried this connection since
                    the last successful attempt, go with this one */
@@ -735,42 +729,12 @@ EXPORT_SYMBOL(ptlrpc_connect_import);
 static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
 {
 #ifdef __KERNEL__
-        struct obd_import_conn *imp_conn;
-#endif
-        int wake_pinger = 0;
-
-        ENTRY;
-
-        cfs_spin_lock(&imp->imp_lock);
-        if (cfs_list_empty(&imp->imp_conn_list))
-                GOTO(unlock, 0);
-
-#ifdef __KERNEL__
-        imp_conn = cfs_list_entry(imp->imp_conn_list.prev,
-                                  struct obd_import_conn,
-                                  oic_item);
-
-        /* XXX: When the failover node is the primary node, it is possible
-         * to have two identical connections in imp_conn_list. We must
-         * compare not conn's pointers but NIDs, otherwise we can defeat
-         * connection throttling. (See bug 14774.) */
-        if (imp->imp_conn_current->oic_conn->c_peer.nid !=
-                                imp_conn->oic_conn->c_peer.nid) {
-                ptlrpc_ping_import_soon(imp);
-                wake_pinger = 1;
-        }
+        /* the pinger takes care of issuing the next reconnect request */
+        return;
 #else
         /* liblustre has no pinger thread, so we wakeup pinger anyway */
-        wake_pinger = 1;
+        ptlrpc_pinger_wake_up();
 #endif
-
- unlock:
-        cfs_spin_unlock(&imp->imp_lock);
-
-        if (wake_pinger)
-                ptlrpc_pinger_wake_up();
-
-        EXIT;
 }
 
 static int ptlrpc_busy_reconnect(int rc)
index 990b215..c306f4a 100644 (file)
@@ -1493,19 +1493,21 @@ test_35b() { # bug 18674
                at_max_set 0 mds client
        fi
 
-       mkdir -p $MOUNT/testdir
-       touch $MOUNT/testdir/test
+       mkdir -p $MOUNT/$tdir
 
        log "Injecting EBUSY on MDS"
        # Setting OBD_FAIL_MDS_RESEND=0x136
        do_facet $SINGLEMDS "$LCTL set_param fail_loc=0x80000136" || return 2
 
-       log "Stat on a test file"
-       stat $MOUNT/testdir/test
+       $LCTL set_param mdc.${FSNAME}*.stats=clear
+  
+       log "Creating a test file and stat it"
+       touch $MOUNT/$tdir/$tfile
+       stat $MOUNT/$tdir/$tfile
 
        log "Stop injecting EBUSY on MDS"
        do_facet $SINGLEMDS "$LCTL set_param fail_loc=0" || return 3
-       rm -f $MOUNT/testdir/test
+       rm -f $MOUNT/$tdir/$tfile
 
        log "done"
        # restore adaptive timeout
@@ -1513,6 +1515,8 @@ test_35b() { # bug 18674
 
        $LCTL dk $TMP/lustre-log-$TESTNAME.log
 
+       CONNCNT=`$LCTL get_param mdc.${FSNAME}*.stats | awk '/mds_connect/{print $2}'`
+
        # retrieve from the log if the client has ever tried to
        # contact the fake server after the loss of connection
        FAILCONN=`awk "BEGIN {ret = 0;}
@@ -1532,7 +1536,14 @@ test_35b() { # bug 18674
                log "ERROR: The client tried to reconnect to the failover server while the primary was busy" && \
                return 5
 
-        cleanup
+       # LU-290
+       # When OBD_FAIL_MDS_RESEND is hit, we sleep for 2 * obd_timeout
+       # Reconnects are supposed to be rate limited to one every 5s
+       [ $CONNCNT -gt $((2 * $TIMEOUT / 5 + 1)) ] && \
+               log "ERROR: Too many reconnects $CONNCNT" && \
+               return 6
+
+       cleanup
        # remove nid settings
        writeconf
 }