Whamcloud - gitweb
b=20615 print more debug info for timedout ZC-req
authorLiang Zhen <zhen.liang@sun.com>
Tue, 6 Apr 2010 08:48:51 +0000 (10:48 +0200)
committerJohann Lombardi <johann@sun.com>
Tue, 6 Apr 2010 08:48:51 +0000 (10:48 +0200)
i=maxim
i=isaac

1. output more information for timedout ZC-req and partial received connection
2. close connection for timedout ZC-req
3. always send ZC_ACK on non-blocking connection(BULK_IN)

lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/klnds/socklnd/socklnd_modparams.c

index 573e790..84d7ddc 100644 (file)
@@ -1550,6 +1550,8 @@ ksocknal_finalize_zcreq(ksock_conn_t *conn)
                 LASSERT (tx->tx_msg.ksm_zc_cookies[0] != 0);
 
                 tx->tx_msg.ksm_zc_cookies[0] = 0;
+                if (tx->tx_resid == 0)
+                        tx->tx_resid = -1; /* mark it as not-acked */
                 list_del(&tx->tx_zc_list);
                 list_add(&tx->tx_zc_list, &zlist);
         }
@@ -1642,6 +1644,8 @@ ksocknal_queue_zombie_conn (ksock_conn_t *conn)
 void
 ksocknal_destroy_conn (ksock_conn_t *conn)
 {
+        cfs_time_t      last_rcv;
+
         /* Final coup-de-grace of the reaper */
         CDEBUG (D_NET, "connection %p\n", conn);
 
@@ -1656,10 +1660,16 @@ ksocknal_destroy_conn (ksock_conn_t *conn)
         /* complete current receive if any */
         switch (conn->ksnc_rx_state) {
         case SOCKNAL_RX_LNET_PAYLOAD:
-                CERROR("Completing partial receive from %s"
-                       ", ip %d.%d.%d.%d:%d, with error\n",
-                       libcfs_id2str(conn->ksnc_peer->ksnp_id),
-                       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+                last_rcv = conn->ksnc_rx_deadline -
+                           cfs_time_seconds(*ksocknal_tunables.ksnd_timeout);
+                CERROR("Completing partial receive from %s[%d]"
+                       ", ip %d.%d.%d.%d:%d, with error, wanted: %d, left: %d, "
+                       "last alive is %ld secs ago\n",
+                       libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type,
+                       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+                       conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left,
+                       cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+                                        last_rcv)));
                 lnet_finalize (conn->ksnc_peer->ksnp_ni,
                                conn->ksnc_cookie, -EIO);
                 break;
index 5c9b1b5..6a61b96 100644 (file)
@@ -120,6 +120,7 @@ typedef struct
         int              *ksnd_peertimeout;     /* seconds to consider peer dead */
         int              *ksnd_enable_csum;     /* enable check sum */
         int              *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
+        int              *ksnd_nonblk_zcack;    /* always send zc-ack on non-blocking connection */
         unsigned int     *ksnd_zc_min_payload;  /* minimum zero copy payload size */
         int              *ksnd_zc_recv;         /* enable ZC receive (for Chelsio TOE) */
         int              *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */
index 14b451f..f4b0174 100644 (file)
@@ -1270,8 +1270,9 @@ ksocknal_process_receive (ksock_conn_t *conn)
                         id   = &conn->ksnc_peer->ksnp_id;
 
                         rc = conn->ksnc_proto->pro_handle_zcreq(conn,
-                                               conn->ksnc_msg.ksm_zc_cookies[0],
-                                               le64_to_cpu(lhdr->src_nid) != id->nid);
+                                        conn->ksnc_msg.ksm_zc_cookies[0],
+                                        *ksocknal_tunables.ksnd_nonblk_zcack ||
+                                        le64_to_cpu(lhdr->src_nid) != id->nid);
                 }
 
                 lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
@@ -2335,6 +2336,7 @@ ksocknal_check_peer_timeouts (int idx)
         struct list_head *peers = &ksocknal_data.ksnd_peers[idx];
         ksock_peer_t     *peer;
         ksock_conn_t     *conn;
+        ksock_tx_t       *tx;
 
  again:
         /* NB. We expect to have a look at all the peers and not find any
@@ -2343,6 +2345,10 @@ ksocknal_check_peer_timeouts (int idx)
         cfs_read_lock (&ksocknal_data.ksnd_global_lock);
 
         list_for_each_entry(peer, peers, ksnp_list) {
+                cfs_time_t  deadline = 0;
+                int         resid = 0;
+                int         n     = 0;
+
                 if (ksocknal_send_keepalive_locked(peer) != 0) {
                         read_unlock (&ksocknal_data.ksnd_global_lock);
                         goto again;
@@ -2380,29 +2386,45 @@ ksocknal_check_peer_timeouts (int idx)
                                 goto again;
                         }
                 }
-        }
 
-        /* print out warnings about stale ZC_REQs */
-        list_for_each_entry(peer, peers, ksnp_list) {
-                ksock_tx_t *tx;
-                int         n = 0;
+                if (list_empty(&peer->ksnp_zc_req_list))
+                        continue;
 
+                cfs_spin_lock(&peer->ksnp_lock);
                 list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
                         if (!cfs_time_aftereq(cfs_time_current(),
                                               tx->tx_deadline))
                                 break;
+                        /* ignore the TX if connection is being closed */
+                        if (tx->tx_conn->ksnc_closing)
+                                continue;
                         n++;
                 }
-
-                if (n != 0) {
-                        tx = list_entry (peer->ksnp_zc_req_list.next,
-                                         ksock_tx_t, tx_zc_list);
-                        CWARN("Stale ZC_REQs for peer %s detected: %d; the "
-                              "oldest (%p) timed out %ld secs ago\n",
-                              libcfs_nid2str(peer->ksnp_id.nid), n, tx,
-                              cfs_duration_sec(cfs_time_current() -
-                                               tx->tx_deadline));
+                if (n == 0) {
+                        cfs_spin_unlock(&peer->ksnp_lock);
+                        continue;
                 }
+
+                tx = list_entry(peer->ksnp_zc_req_list.next,
+                                ksock_tx_t, tx_zc_list);
+                deadline = tx->tx_deadline;
+                resid    = tx->tx_resid;
+                conn     = tx->tx_conn;
+                ksocknal_conn_addref(conn);
+
+                cfs_spin_unlock(&peer->ksnp_lock);
+                cfs_read_unlock (&ksocknal_data.ksnd_global_lock);
+
+                CERROR("Total %d stale ZC_REQs for peer %s detected; the "
+                       "oldest(%p) timed out %ld secs ago, "
+                       "resid: %d, wmem: %d\n",
+                       n, libcfs_nid2str(peer->ksnp_id.nid), tx,
+                       cfs_duration_sec(cfs_time_current() - deadline),
+                       resid, libcfs_sock_wmem_queued(conn->ksnc_sock));
+
+                ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+                ksocknal_conn_decref(conn);
+                goto again;
         }
 
         cfs_read_unlock (&ksocknal_data.ksnd_global_lock);
index c4a6261..853c08e 100644 (file)
@@ -129,6 +129,10 @@ CFS_MODULE_PARM(enable_irq_affinity, "i", int, 0644,
                 "enable IRQ affinity");
 #endif
 
+static int nonblk_zcack = 1;
+CFS_MODULE_PARM(nonblk_zcack, "i", int, 0644,
+                "always send ZC-ACK on non-blocking connection");
+
 static unsigned int zc_min_payload = (16 << 10);
 CFS_MODULE_PARM(zc_min_payload, "i", int, 0644,
                 "minimum payload size to zero copy");
@@ -179,6 +183,7 @@ ksock_tunables_t ksocknal_tunables = {
         .ksnd_keepalive_intvl = &keepalive_intvl,
         .ksnd_enable_csum     = &enable_csum,
         .ksnd_inject_csum_error = &inject_csum_error,
+        .ksnd_nonblk_zcack    = &nonblk_zcack,
         .ksnd_zc_min_payload  = &zc_min_payload,
         .ksnd_zc_recv         = &zc_recv,
         .ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags,