Revert "b=22423 Reconnects are not throttled"

author Mikhail Pershin <tappro@sun.com>

Mon, 30 Aug 2010 21:17:09 +0000 (01:17 +0400)

committer Mikhail Pershin <tappro@sun.com>

Mon, 30 Aug 2010 21:17:09 +0000 (01:17 +0400)
author Mikhail Pershin <tappro@sun.com>
Mon, 30 Aug 2010 21:17:09 +0000 (01:17 +0400)
committer Mikhail Pershin <tappro@sun.com>
Mon, 30 Aug 2010 21:17:09 +0000 (01:17 +0400)
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c

index 272d86c..1a461f1 100644 (file)
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -476,6 +476,12 @@ static int import_select_connection(struct obd_import *imp)
                         imp->imp_obd->obd_name,
                         libcfs_nid2str(conn->oic_conn->c_peer.nid),
                         conn->oic_last_attempt);
+                /* Don't thrash connections */
+                if (cfs_time_before_64(cfs_time_current_64(),
+                                     conn->oic_last_attempt +
+                                     cfs_time_seconds(CONNECTION_SWITCH_MIN))) {
+                        continue;
+                }
  
                  /* If we have not tried this connection since
                     the last successful attempt, go with this one */
@@ -726,12 +732,42 @@ EXPORT_SYMBOL(ptlrpc_connect_import);
  static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
  {
  #ifdef __KERNEL__
-        /* the pinger takes care of issuing the next reconnect request */
-        return;
+        struct obd_import_conn *imp_conn;
+#endif
+        int wake_pinger = 0;
+
+        ENTRY;
+
+        cfs_spin_lock(&imp->imp_lock);
+        if (cfs_list_empty(&imp->imp_conn_list))
+                GOTO(unlock, 0);
+
+#ifdef __KERNEL__
+        imp_conn = cfs_list_entry(imp->imp_conn_list.prev,
+                                  struct obd_import_conn,
+                                  oic_item);
+
+        /* XXX: When the failover node is the primary node, it is possible
+         * to have two identical connections in imp_conn_list. We must
+         * compare not conn's pointers but NIDs, otherwise we can defeat
+         * connection throttling. (See bug 14774.) */
+        if (imp->imp_conn_current->oic_conn->c_peer.nid !=
+                                imp_conn->oic_conn->c_peer.nid) {
+                ptlrpc_ping_import_soon(imp);
+                wake_pinger = 1;
+        }
  #else
          /* liblustre has no pinger thread, so we wakeup pinger anyway */
-        ptlrpc_pinger_wake_up();
+        wake_pinger = 1;
  #endif
+
+ unlock:
+        cfs_spin_unlock(&imp->imp_lock);
+
+        if (wake_pinger)
+                ptlrpc_pinger_wake_up();
+
+        EXIT;
  }
  
  static int ptlrpc_busy_reconnect(int rc)
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh

index f4d8546..055302d 100644 (file)
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -1374,21 +1374,19 @@ test_35b() { # bug 18674
                 at_max_set 0 mds client
         fi
  
-       mkdir -p $MOUNT/$tdir
+       mkdir -p $MOUNT/testdir
+       touch $MOUNT/testdir/test
  
         log "Injecting EBUSY on MDS"
         # Setting OBD_FAIL_MDS_RESEND=0x136
         do_facet mds "$LCTL set_param fail_loc=0x80000136" || return 2
  
-       $LCTL set_param mdc.${FSNAME}*.stats=clear
-
-       log "Creating a test file and stat it"
-       touch $MOUNT/$tdir/$tfile
-       stat $MOUNT/$tdir/$tfile
+       log "Stat on a test file"
+       stat $MOUNT/testdir/test
  
         log "Stop injecting EBUSY on MDS"
         do_facet mds "$LCTL set_param fail_loc=0" || return 3
-       rm -f $MOUNT/$tdir/$tfile
+       rm -f $MOUNT/testdir/test
  
         log "done"
         # restore adaptive timeout
@@ -1396,8 +1394,6 @@ test_35b() { # bug 18674
  
         $LCTL dk $TMP/lustre-log-$TESTNAME.log
  
-       CONNCNT=`$LCTL get_param mdc.${FSNAME}*.stats | awk '/mds_connect/{print $2}'`
-
         # retrieve from the log if the client has ever tried to
         # contact the fake server after the loss of connection
         FAILCONN=`awk "BEGIN {ret = 0;}
@@ -1417,12 +1413,6 @@ test_35b() { # bug 18674
                 log "ERROR: The client tried to reconnect to the failover server while the primary was busy" && \
                 return 5
  
-       # When OBD_FAIL_MDS_RESEND is hit, we sleep for 2 * obd_timeout
-        # Reconnects are supposed to be rate limited to one every 5s
-       [ $CONNCNT -gt $((2 * $TIMEOUT / 5 + 1)) ] && \
-               log "ERROR: Too many reconnects $CONNCNT" && \
-               return 6
-
          cleanup
         # remove nid settings
         writeconf
author	Mikhail Pershin <tappro@sun.com>
	Mon, 30 Aug 2010 21:17:09 +0000 (01:17 +0400)
committer	Mikhail Pershin <tappro@sun.com>
	Mon, 30 Aug 2010 21:17:09 +0000 (01:17 +0400)
lustre/ptlrpc/import.c		patch | blob | history
lustre/tests/conf-sanity.sh		patch | blob | history