Whamcloud - gitweb
b=14425
author isaac <isaac>
Fri, 14 Mar 2008 18:18:06 +0000 (18:18 +0000)
committer isaac <isaac>
Fri, 14 Mar 2008 18:18:06 +0000 (18:18 +0000)
i=liangzhen, i=maxim
-   fixed a deadlock in o2iblnd/ptllnd credit flow.

lnet/ChangeLog
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/klnds/ptllnd/ptllnd.c
lnet/klnds/ptllnd/ptllnd.h
lnet/klnds/ptllnd/ptllnd_cb.c
lnet/klnds/ptllnd/ptllnd_peer.c
lnet/klnds/ptllnd/ptllnd_rx_buf.c

index 983b21b..6aff8fe 100644 (file)
@@ -11,6 +11,12 @@ tbd  Sun Microsystems, Inc.
        gmlnd     - GM 2.1.22 and later,
        mxlnd     - MX 1.2.1 or later,
        ptllnd    - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
        gmlnd     - GM 2.1.22 and later,
        mxlnd     - MX 1.2.1 or later,
        ptllnd    - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
+
+Severity   : major
+Bugzilla   : 14425
+Description: o2iblnd/ptllnd credit deadlock in a routed config.
+Details    : o2iblnd/ptllnd credit deadlock in a routed config.
+
 Severity   : normal
 Bugzilla   : 14956
 Description: High load after starting lnet
 Severity   : normal
 Bugzilla   : 14956
 Description: High load after starting lnet
index e5369ff..b8a994a 100644 (file)
@@ -598,6 +598,10 @@ kiblnd_debug_conn (kib_conn_t *conn)
         list_for_each(tmp, &conn->ibc_early_rxs)
                 kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
 
         list_for_each(tmp, &conn->ibc_early_rxs)
                 kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
 
+        CDEBUG(D_CONSOLE, "   tx_noops:\n");
+        list_for_each(tmp, &conn->ibc_tx_noops)
+                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
         CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
         list_for_each(tmp, &conn->ibc_tx_queue_nocred)
                 kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
         CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
         list_for_each(tmp, &conn->ibc_tx_queue_nocred)
                 kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
@@ -666,6 +670,7 @@ kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, int state)
         conn->ibc_cmid = cmid;
 
         INIT_LIST_HEAD(&conn->ibc_early_rxs);
         conn->ibc_cmid = cmid;
 
         INIT_LIST_HEAD(&conn->ibc_early_rxs);
+        INIT_LIST_HEAD(&conn->ibc_tx_noops);
         INIT_LIST_HEAD(&conn->ibc_tx_queue);
         INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
         INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
         INIT_LIST_HEAD(&conn->ibc_tx_queue);
         INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
         INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
@@ -741,9 +746,8 @@ kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, int state)
         memset(init_qp_attr, 0, sizeof(*init_qp_attr));
         init_qp_attr->event_handler = kiblnd_qp_event;
         init_qp_attr->qp_context = conn;
         memset(init_qp_attr, 0, sizeof(*init_qp_attr));
         init_qp_attr->event_handler = kiblnd_qp_event;
         init_qp_attr->qp_context = conn;
-        init_qp_attr->cap.max_send_wr = (*kiblnd_tunables.kib_concurrent_sends) *
-                                        (1 + IBLND_MAX_RDMA_FRAGS);
-        init_qp_attr->cap.max_recv_wr = IBLND_RX_MSGS;
+        init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS;
+        init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS;
         init_qp_attr->cap.max_send_sge = 1;
         init_qp_attr->cap.max_recv_sge = 1;
         init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
         init_qp_attr->cap.max_send_sge = 1;
         init_qp_attr->cap.max_recv_sge = 1;
         init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -850,6 +854,7 @@ kiblnd_destroy_conn (kib_conn_t *conn)
         LASSERT (!in_interrupt());
         LASSERT (atomic_read(&conn->ibc_refcount) == 0);
         LASSERT (list_empty(&conn->ibc_early_rxs));
         LASSERT (!in_interrupt());
         LASSERT (atomic_read(&conn->ibc_refcount) == 0);
         LASSERT (list_empty(&conn->ibc_early_rxs));
+        LASSERT (list_empty(&conn->ibc_tx_noops));
         LASSERT (list_empty(&conn->ibc_tx_queue));
         LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
         LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
         LASSERT (list_empty(&conn->ibc_tx_queue));
         LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
         LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
index 2ada24f..f7926c7 100644 (file)
@@ -98,13 +98,15 @@ typedef int gfp_t;
 #define IBLND_TX_MSG_PAGES()  ((IBLND_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
 
 /* RX messages (per connection) */
 #define IBLND_TX_MSG_PAGES()  ((IBLND_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
 
 /* RX messages (per connection) */
-#define IBLND_RX_MSGS         (IBLND_MSG_QUEUE_SIZE*2)
+#define IBLND_RX_MSGS         (IBLND_MSG_QUEUE_SIZE * 2)
 #define IBLND_RX_MSG_BYTES    (IBLND_RX_MSGS * IBLND_MSG_SIZE)
 #define IBLND_RX_MSG_PAGES    ((IBLND_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
 #define IBLND_RX_MSG_BYTES    (IBLND_RX_MSGS * IBLND_MSG_SIZE)
 #define IBLND_RX_MSG_PAGES    ((IBLND_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
-#define IBLND_CQ_ENTRIES()    (IBLND_RX_MSGS +                                  \
-                               (*kiblnd_tunables.kib_concurrent_sends) *        \
+/* WRs and CQEs (per connection) */
+#define IBLND_RECV_WRS        IBLND_RX_MSGS
+#define IBLND_SEND_WRS        ((*kiblnd_tunables.kib_concurrent_sends) * \
                                (1 + IBLND_MAX_RDMA_FRAGS))
                                (1 + IBLND_MAX_RDMA_FRAGS))
+#define IBLND_CQ_ENTRIES()    (IBLND_RECV_WRS + IBLND_SEND_WRS)
 
 typedef struct
 {
 
 typedef struct
 {
@@ -393,6 +395,7 @@ typedef struct kib_conn
         int                 ibc_ready:1;        /* CQ callback fired */
         unsigned long       ibc_last_send;      /* time of last send */
         struct list_head    ibc_early_rxs;      /* rxs completed before ESTABLISHED */
         int                 ibc_ready:1;        /* CQ callback fired */
         unsigned long       ibc_last_send;      /* time of last send */
         struct list_head    ibc_early_rxs;      /* rxs completed before ESTABLISHED */
+        struct list_head    ibc_tx_noops;       /* IBLND_MSG_NOOPs */
         struct list_head    ibc_tx_queue;       /* sends that need a credit */
         struct list_head    ibc_tx_queue_nocred;/* sends that don't need a credit */
         struct list_head    ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
         struct list_head    ibc_tx_queue;       /* sends that need a credit */
         struct list_head    ibc_tx_queue_nocred;/* sends that don't need a credit */
         struct list_head    ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
@@ -507,6 +510,28 @@ kiblnd_send_keepalive(kib_conn_t *conn)
                            *kiblnd_tunables.kib_keepalive*HZ);
 }
 
                            *kiblnd_tunables.kib_keepalive*HZ);
 }
 
+static inline int
+kiblnd_send_noop(kib_conn_t *conn) /* returns non-zero if a NOOP should go out on conn now, 0 otherwise */
+{
+        LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+        /* First gate: nothing to do unless outstanding credits reached IBLND_CREDIT_HIGHWATER or kiblnd_send_keepalive() is non-zero */
+        if (conn->ibc_outstanding_credits < IBLND_CREDIT_HIGHWATER &&
+            !kiblnd_send_keepalive(conn))
+                return 0; /* No need to send NOOP */
+        /* Second gate: a NOOP is pointless or impossible in any of these three cases */
+        if (!list_empty(&conn->ibc_tx_noops) ||       /* NOOP already queued */
+            !list_empty(&conn->ibc_tx_queue_nocred) || /* can be piggybacked */
+            conn->ibc_credits == 0)                    /* no credit */
+                return 0;
+        /* Third gate: with a single credit left, only spend it when there are credits to hand back */
+        if (conn->ibc_credits == 1 &&      /* last credit reserved for */
+            conn->ibc_outstanding_credits == 0) /* giving back credits */
+                return 0;
+
+        /* No tx to piggyback NOOP onto or no credit to send a tx */
+        return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); /* ibc_credits == 1: queued txs are held back, so only a NOOP can go */
+}
+
 static inline void
 kiblnd_abort_receives(kib_conn_t *conn)
 {
 static inline void
 kiblnd_abort_receives(kib_conn_t *conn)
 {
index e4504ae..3b7d6c6 100644 (file)
@@ -376,6 +376,10 @@ kiblnd_handle_rx (kib_rx_t *rx)
 
                 conn->ibc_credits += credits;
 
 
                 conn->ibc_credits += credits;
 
+                /* This ensures the credit taken by NOOP can be returned */
+                if (msg->ibm_type == IBLND_MSG_NOOP)
+                        conn->ibc_outstanding_credits++;
+
                 spin_unlock(&conn->ibc_lock);
                 kiblnd_check_sends(conn);
         }
                 spin_unlock(&conn->ibc_lock);
                 kiblnd_check_sends(conn);
         }
@@ -389,7 +393,10 @@ kiblnd_handle_rx (kib_rx_t *rx)
                 break;
 
         case IBLND_MSG_NOOP:
                 break;
 
         case IBLND_MSG_NOOP:
-                post_credit = IBLND_POSTRX_PEER_CREDIT;
+                if (credits != 0) /* credit already posted */
+                        post_credit = IBLND_POSTRX_NO_CREDIT;
+                else              /* a keepalive NOOP */
+                        post_credit = IBLND_POSTRX_PEER_CREDIT;
                 break;
 
         case IBLND_MSG_IMMEDIATE:
                 break;
 
         case IBLND_MSG_IMMEDIATE:
@@ -887,10 +894,7 @@ kiblnd_check_sends (kib_conn_t *conn)
                 conn->ibc_reserved_credits--;
         }
 
                 conn->ibc_reserved_credits--;
         }
 
-        if (list_empty(&conn->ibc_tx_queue) &&
-            list_empty(&conn->ibc_tx_queue_nocred) &&
-            (conn->ibc_outstanding_credits >= IBLND_CREDIT_HIGHWATER ||
-             kiblnd_send_keepalive(conn))) {
+        if (kiblnd_send_noop(conn)) {
                 spin_unlock(&conn->ibc_lock);
 
                 tx = kiblnd_get_idle_tx(ni);
                 spin_unlock(&conn->ibc_lock);
 
                 tx = kiblnd_get_idle_tx(ni);
@@ -904,13 +908,17 @@ kiblnd_check_sends (kib_conn_t *conn)
         }
 
         for (;;) {
         }
 
         for (;;) {
-                if (!list_empty (&conn->ibc_tx_queue_nocred)) {
-                        tx = list_entry (conn->ibc_tx_queue_nocred.next, 
-                                         kib_tx_t, tx_list);
+                if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+                        tx = list_entry(conn->ibc_tx_queue_nocred.next, 
+                                        kib_tx_t, tx_list);
                         consume_cred = 0;
                         consume_cred = 0;
-                } else if (!list_empty (&conn->ibc_tx_queue)) {
-                        tx = list_entry (conn->ibc_tx_queue.next,
-                                         kib_tx_t, tx_list);
+                } else if (!list_empty(&conn->ibc_tx_noops)) {
+                        tx = list_entry(conn->ibc_tx_noops.next,
+                                        kib_tx_t, tx_list);
+                        consume_cred = 1;
+                } else if (!list_empty(&conn->ibc_tx_queue)) {
+                        tx = list_entry(conn->ibc_tx_queue.next,
+                                        kib_tx_t, tx_list);
                         consume_cred = 1;
                 } else {
                         /* nothing to send right now */
                         consume_cred = 1;
                 } else {
                         /* nothing to send right now */
@@ -939,27 +947,25 @@ kiblnd_check_sends (kib_conn_t *conn)
                         if (conn->ibc_credits == 0) {   /* no credits */
                                 CDEBUG(D_NET, "%s: no credits\n",
                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         if (conn->ibc_credits == 0) {   /* no credits */
                                 CDEBUG(D_NET, "%s: no credits\n",
                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
-                                break;
+                                break; /* NB ibc_tx_queue_nocred checked */
                         }
 
                         }
 
-                        if (conn->ibc_credits == 1 &&   /* last credit reserved for */
-                            conn->ibc_outstanding_credits == 0) { /* giving back credits */
+                        /* Last credit reserved for NOOP */
+                        if (conn->ibc_credits == 1 &&
+                            tx->tx_msg->ibm_type != IBLND_MSG_NOOP) {
                                 CDEBUG(D_NET, "%s: not using last credit\n",
                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
                                 CDEBUG(D_NET, "%s: not using last credit\n",
                                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
-                                break;
+                                break; /* NB ibc_tx_noops checked */
                         }
                 }
 
                         }
                 }
 
-                list_del (&tx->tx_list);
+                list_del(&tx->tx_list);
                 tx->tx_queued = 0;
 
                 /* NB don't drop ibc_lock before bumping tx_sending */
 
                 if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP &&
                 tx->tx_queued = 0;
 
                 /* NB don't drop ibc_lock before bumping tx_sending */
 
                 if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP &&
-                    (!list_empty(&conn->ibc_tx_queue) ||
-                     !list_empty(&conn->ibc_tx_queue_nocred) ||
-                     (conn->ibc_outstanding_credits < IBLND_CREDIT_HIGHWATER &&
-                      !kiblnd_send_keepalive(conn)))) {
+                    !kiblnd_send_noop(conn)) {
                         /* redundant NOOP */
                         spin_unlock(&conn->ibc_lock);
                         kiblnd_tx_done(ni, tx);
                         /* redundant NOOP */
                         spin_unlock(&conn->ibc_lock);
                         kiblnd_tx_done(ni, tx);
@@ -1304,6 +1310,9 @@ kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
                 break;
 
         case IBLND_MSG_NOOP:
                 break;
 
         case IBLND_MSG_NOOP:
+                q = &conn->ibc_tx_noops;
+                break;
+
         case IBLND_MSG_IMMEDIATE:
                 q = &conn->ibc_tx_queue;
                 break;
         case IBLND_MSG_IMMEDIATE:
                 q = &conn->ibc_tx_queue;
                 break;
@@ -1906,6 +1915,7 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error)
                 return; /* already being handled  */
 
         if (error == 0 &&
                 return; /* already being handled  */
 
         if (error == 0 &&
+            list_empty(&conn->ibc_tx_noops) &&
             list_empty(&conn->ibc_tx_queue) &&
             list_empty(&conn->ibc_tx_queue_rsrvd) &&
             list_empty(&conn->ibc_tx_queue_nocred) &&
             list_empty(&conn->ibc_tx_queue) &&
             list_empty(&conn->ibc_tx_queue_rsrvd) &&
             list_empty(&conn->ibc_tx_queue_nocred) &&
@@ -1913,9 +1923,10 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error)
                 CDEBUG(D_NET, "closing conn to %s\n", 
                        libcfs_nid2str(peer->ibp_nid));
         } else {
                 CDEBUG(D_NET, "closing conn to %s\n", 
                        libcfs_nid2str(peer->ibp_nid));
         } else {
-                CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s\n",
+                CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s%s\n",
                        libcfs_nid2str(peer->ibp_nid), error,
                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
                        libcfs_nid2str(peer->ibp_nid), error,
                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+                       list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
                        list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
                        list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
                        list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
                        list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
@@ -2030,6 +2041,7 @@ kiblnd_finalise_conn (kib_conn_t *conn)
         /* Complete all tx descs not waiting for sends to complete.
          * NB we should be safe from RDMA now that the QP has changed state */
 
         /* Complete all tx descs not waiting for sends to complete.
          * NB we should be safe from RDMA now that the QP has changed state */
 
+        kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
@@ -2334,8 +2346,8 @@ kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
         /* conn now "owns" cmid, so I return success from here on to ensure the
          * CM callback doesn't destroy cmid. */
 
         /* conn now "owns" cmid, so I return success from here on to ensure the
          * CM callback doesn't destroy cmid. */
 
-        conn->ibc_incarnation = reqmsg->ibm_srcstamp;
-        conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
+        conn->ibc_incarnation      = reqmsg->ibm_srcstamp;
+        conn->ibc_credits          = IBLND_MSG_QUEUE_SIZE;
         conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
         LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
                  <= IBLND_RX_MSGS);
         conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
         LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
                  <= IBLND_RX_MSGS);
@@ -2544,8 +2556,8 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
                 goto failed;
         }
 
                 goto failed;
         }
 
-        conn->ibc_incarnation = msg->ibm_srcstamp;
-        conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
+        conn->ibc_incarnation      = msg->ibm_srcstamp;
+        conn->ibc_credits          = IBLND_MSG_QUEUE_SIZE;
         conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
         LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
                  <= IBLND_RX_MSGS);
         conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
         LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
                  <= IBLND_RX_MSGS);
@@ -2808,6 +2820,7 @@ int
 kiblnd_conn_timed_out (kib_conn_t *conn)
 {
         return  kiblnd_check_txs(conn, &conn->ibc_tx_queue) ||
 kiblnd_conn_timed_out (kib_conn_t *conn)
 {
         return  kiblnd_check_txs(conn, &conn->ibc_tx_queue) ||
+                kiblnd_check_txs(conn, &conn->ibc_tx_noops) ||
                 kiblnd_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
                 kiblnd_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
                 kiblnd_check_txs(conn, &conn->ibc_active_txs);
                 kiblnd_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
                 kiblnd_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
                 kiblnd_check_txs(conn, &conn->ibc_active_txs);
index f020fac..0a55687 100755 (executable)
@@ -475,6 +475,12 @@ kptllnd_startup (lnet_ni_t *ni)
                 return -EINVAL;
         }
 
                 return -EINVAL;
         }
 
+        /* kptl_msg_t::ptlm_credits is only a __u8 */
+        if (*kptllnd_tunables.kptl_peercredits > 255) {
+                CERROR("kptl_peercredits must be <= 255\n");
+                return -EINVAL;
+        }
+
         *kptllnd_tunables.kptl_max_msg_size &= ~7;
         if (*kptllnd_tunables.kptl_max_msg_size < PTLLND_MIN_BUFFER_SIZE)
                 *kptllnd_tunables.kptl_max_msg_size = PTLLND_MIN_BUFFER_SIZE;
         *kptllnd_tunables.kptl_max_msg_size &= ~7;
         if (*kptllnd_tunables.kptl_max_msg_size < PTLLND_MIN_BUFFER_SIZE)
                 *kptllnd_tunables.kptl_max_msg_size = PTLLND_MIN_BUFFER_SIZE;
index 2e6e8a4..b1d4360 100755 (executable)
@@ -136,6 +136,10 @@ typedef struct kptl_rx                          /* receive message */
         char                    rx_space[0];    /* copy of incoming request */
 } kptl_rx_t;
 
         char                    rx_space[0];    /* copy of incoming request */
 } kptl_rx_t;
 
+#define PTLLND_POSTRX_DONT_POST    0            /* don't post */
+#define PTLLND_POSTRX_NO_CREDIT    1            /* post: no credits */
+#define PTLLND_POSTRX_PEER_CREDIT  2            /* post: give peer back 1 credit */
+
 typedef struct kptl_rx_buffer_pool
 {
         spinlock_t              rxbp_lock;
 typedef struct kptl_rx_buffer_pool
 {
         spinlock_t              rxbp_lock;
@@ -217,6 +221,7 @@ struct kptl_peer
         atomic_t                peer_refcount;          /* The current refrences */
         enum kptllnd_peer_state peer_state;
         spinlock_t              peer_lock;              /* serialize */
         atomic_t                peer_refcount;          /* The current refrences */
         enum kptllnd_peer_state peer_state;
         spinlock_t              peer_lock;              /* serialize */
+        struct list_head        peer_noops;             /* PTLLND_MSG_TYPE_NOOP txs */
         struct list_head        peer_sendq;             /* txs waiting for mh handles */
         struct list_head        peer_activeq;           /* txs awaiting completion */
         lnet_process_id_t       peer_id;                /* Peer's LNET id */
         struct list_head        peer_sendq;             /* txs waiting for mh handles */
         struct list_head        peer_activeq;           /* txs awaiting completion */
         lnet_process_id_t       peer_id;                /* Peer's LNET id */
@@ -401,8 +406,8 @@ kptllnd_rx_buffer_decref(kptl_rx_buffer_t *rxb)
 /*
  * RX SUPPORT FUNCTIONS
  */
 /*
  * RX SUPPORT FUNCTIONS
  */
-void kptllnd_rx_done(kptl_rx_t *rx);
 void kptllnd_rx_parse(kptl_rx_t *rx);
 void kptllnd_rx_parse(kptl_rx_t *rx);
+void kptllnd_rx_done(kptl_rx_t *rx, int post_credit);
 
 /*
  * PEER SUPPORT FUNCTIONS
 
 /*
  * PEER SUPPORT FUNCTIONS
index ed3bb95..1903fc6 100644 (file)
@@ -598,7 +598,7 @@ kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
         /*
          * We're done with the RX
          */
         /*
          * We're done with the RX
          */
-        kptllnd_rx_done(rx);
+        kptllnd_rx_done(rx, PTLLND_POSTRX_PEER_CREDIT);
         return rc;
 }
 
         return rc;
 }
 
index 98e174f..f4e67f4 100644 (file)
@@ -158,6 +158,7 @@ kptllnd_peer_allocate (lnet_process_id_t lpid, ptl_process_id_t ppid)
 
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
 
 
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
 
+        INIT_LIST_HEAD (&peer->peer_noops);
         INIT_LIST_HEAD (&peer->peer_sendq);
         INIT_LIST_HEAD (&peer->peer_activeq);
         spin_lock_init (&peer->peer_lock);
         INIT_LIST_HEAD (&peer->peer_sendq);
         INIT_LIST_HEAD (&peer->peer_activeq);
         spin_lock_init (&peer->peer_lock);
@@ -205,6 +206,7 @@ kptllnd_peer_destroy (kptl_peer_t *peer)
         LASSERT (atomic_read(&peer->peer_refcount) == 0);
         LASSERT (peer->peer_state == PEER_STATE_ALLOCATED ||
                  peer->peer_state == PEER_STATE_ZOMBIE);
         LASSERT (atomic_read(&peer->peer_refcount) == 0);
         LASSERT (peer->peer_state == PEER_STATE_ALLOCATED ||
                  peer->peer_state == PEER_STATE_ZOMBIE);
+        LASSERT (list_empty(&peer->peer_noops));
         LASSERT (list_empty(&peer->peer_sendq));
         LASSERT (list_empty(&peer->peer_activeq));
 
         LASSERT (list_empty(&peer->peer_sendq));
         LASSERT (list_empty(&peer->peer_activeq));
 
@@ -245,6 +247,7 @@ kptllnd_peer_cancel_txs(kptl_peer_t *peer, struct list_head *txs)
 
         spin_lock_irqsave(&peer->peer_lock, flags);
 
 
         spin_lock_irqsave(&peer->peer_lock, flags);
 
+        kptllnd_cancel_txlist(&peer->peer_noops, txs);
         kptllnd_cancel_txlist(&peer->peer_sendq, txs);
         kptllnd_cancel_txlist(&peer->peer_activeq, txs);
                 
         kptllnd_cancel_txlist(&peer->peer_sendq, txs);
         kptllnd_cancel_txlist(&peer->peer_activeq, txs);
                 
@@ -519,7 +522,9 @@ kptllnd_post_tx(kptl_peer_t *peer, kptl_tx_t *tx, int nfrag)
         tx->tx_msg_mdh = msg_mdh;
 
        /* Ensure HELLO is sent first */
         tx->tx_msg_mdh = msg_mdh;
 
        /* Ensure HELLO is sent first */
-       if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO)
+        if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP)
+               list_add(&tx->tx_list, &peer->peer_noops);
+       else if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO)
                list_add(&tx->tx_list, &peer->peer_sendq);
        else
                list_add_tail(&tx->tx_list, &peer->peer_sendq);
                list_add(&tx->tx_list, &peer->peer_sendq);
        else
                list_add_tail(&tx->tx_list, &peer->peer_sendq);
@@ -527,12 +532,26 @@ kptllnd_post_tx(kptl_peer_t *peer, kptl_tx_t *tx, int nfrag)
         spin_unlock_irqrestore(&peer->peer_lock, flags);
 }
 
         spin_unlock_irqrestore(&peer->peer_lock, flags);
 }
 
+static inline int
+kptllnd_peer_send_noop (kptl_peer_t *peer) /* returns non-zero if a NOOP should be posted to peer now, 0 otherwise */
+{
+        if (!peer->peer_sent_hello || /* peer_sent_hello not set yet: no NOOP */
+            peer->peer_credits == 0 || /* no credit to send anything */
+            !list_empty(&peer->peer_noops) || /* a NOOP is already queued */
+            peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER) /* below the high-water mark */
+                return 0;
+
+        /* No tx to piggyback NOOP onto or no credit to send a tx */
+        return (list_empty(&peer->peer_sendq) || peer->peer_credits == 1); /* peer_credits == 1: the last credit is kept for NOOP/HELLO in kptllnd_peer_check_sends() */
+}
+
 void
 kptllnd_peer_check_sends (kptl_peer_t *peer)
 {
         ptl_handle_me_t  meh;
         kptl_tx_t       *tx;
         int              rc;
 void
 kptllnd_peer_check_sends (kptl_peer_t *peer)
 {
         ptl_handle_me_t  meh;
         kptl_tx_t       *tx;
         int              rc;
+        int              msg_type;
         unsigned long    flags;
 
         LASSERT(!in_interrupt());
         unsigned long    flags;
 
         LASSERT(!in_interrupt());
@@ -541,10 +560,7 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
 
         peer->peer_retry_noop = 0;
 
 
         peer->peer_retry_noop = 0;
 
-        if (list_empty(&peer->peer_sendq) &&
-            peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER &&
-            peer->peer_credits != 0) {
-
+        if (kptllnd_peer_send_noop(peer)) {
                 /* post a NOOP to return credits */
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
 
                 /* post a NOOP to return credits */
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
 
@@ -561,8 +577,18 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
                 peer->peer_retry_noop = (tx == NULL);
         }
 
                 peer->peer_retry_noop = (tx == NULL);
         }
 
-        while (!list_empty(&peer->peer_sendq)) {
-                tx = list_entry (peer->peer_sendq.next, kptl_tx_t, tx_list);
+        for (;;) {
+                if (!list_empty(&peer->peer_noops)) {
+                        LASSERT (peer->peer_sent_hello);
+                        tx = list_entry(peer->peer_noops.next,
+                                        kptl_tx_t, tx_list);
+                } else if (!list_empty(&peer->peer_sendq)) {
+                        tx = list_entry(peer->peer_sendq.next,
+                                        kptl_tx_t, tx_list);
+                } else {
+                        /* nothing to send right now */
+                        break;
+                }
 
                 LASSERT (tx->tx_active);
                 LASSERT (!PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
 
                 LASSERT (tx->tx_active);
                 LASSERT (!PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
@@ -575,32 +601,37 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
                          *kptllnd_tunables.kptl_peercredits);
                 LASSERT (peer->peer_credits >= 0);
 
                          *kptllnd_tunables.kptl_peercredits);
                 LASSERT (peer->peer_credits >= 0);
 
-               /* Ensure HELLO is sent first */
-               if (!peer->peer_sent_hello) {
-                       if (tx->tx_msg->ptlm_type != PTLLND_MSG_TYPE_HELLO)
-                               break;
-                       peer->peer_sent_hello = 1;
-               }
+                msg_type = tx->tx_msg->ptlm_type;
+
+                /* Ensure HELLO is sent first */
+                if (!peer->peer_sent_hello) {
+                        LASSERT (list_empty(&peer->peer_noops));
+                        if (msg_type != PTLLND_MSG_TYPE_HELLO)
+                                break;
+                        peer->peer_sent_hello = 1;
+                }
 
                 if (peer->peer_credits == 0) {
 
                 if (peer->peer_credits == 0) {
-                        CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: no credits for %p\n",
+                        CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: no credits for %s[%p]\n",
                                libcfs_id2str(peer->peer_id), 
                                peer->peer_credits,
                                peer->peer_outstanding_credits, 
                                libcfs_id2str(peer->peer_id), 
                                peer->peer_credits,
                                peer->peer_outstanding_credits, 
-                               peer->peer_sent_credits, tx);
+                               peer->peer_sent_credits, 
+                               kptllnd_msgtype2str(msg_type), tx);
                         break;
                 }
 
                         break;
                 }
 
-                /* Don't use the last credit unless I've got credits to
-                 * return */
+                /* Last/Initial credit reserved for NOOP/HELLO */
                 if (peer->peer_credits == 1 &&
                 if (peer->peer_credits == 1 &&
-                    peer->peer_outstanding_credits == 0) {
+                    msg_type != PTLLND_MSG_TYPE_HELLO &&
+                    msg_type != PTLLND_MSG_TYPE_NOOP) {
                         CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: "
                         CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: "
-                               "not using last credit for %p\n",
+                               "not using last credit for %s[%p]\n",
                                libcfs_id2str(peer->peer_id), 
                                peer->peer_credits,
                                peer->peer_outstanding_credits,
                                libcfs_id2str(peer->peer_id), 
                                peer->peer_credits,
                                peer->peer_outstanding_credits,
-                               peer->peer_sent_credits, tx);
+                               peer->peer_sent_credits,
+                               kptllnd_msgtype2str(msg_type), tx);
                         break;
                 }
 
                         break;
                 }
 
@@ -608,10 +639,8 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
 
                 /* Discard any NOOP I queued if I'm not at the high-water mark
                  * any more or more messages have been queued */
 
                 /* Discard any NOOP I queued if I'm not at the high-water mark
                  * any more or more messages have been queued */
-                if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP &&
-                    (!list_empty(&peer->peer_sendq) ||
-                     peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)) {
-
+                if (msg_type == PTLLND_MSG_TYPE_NOOP &&
+                    !kptllnd_peer_send_noop(peer)) {
                         tx->tx_active = 0;
 
                         spin_unlock_irqrestore(&peer->peer_lock, flags);
                         tx->tx_active = 0;
 
                         spin_unlock_irqrestore(&peer->peer_lock, flags);
@@ -636,7 +665,7 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
                         tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits =
                                 peer->peer_next_matchbits++;
                 }
                         tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits =
                                 peer->peer_next_matchbits++;
                 }
-                
+
                 peer->peer_sent_credits += peer->peer_outstanding_credits;
                 peer->peer_outstanding_credits = 0;
                 peer->peer_credits--;
                 peer->peer_sent_credits += peer->peer_outstanding_credits;
                 peer->peer_outstanding_credits = 0;
                 peer->peer_credits--;
@@ -644,8 +673,7 @@ kptllnd_peer_check_sends (kptl_peer_t *peer)
                 CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: %s tx=%p nob=%d cred=%d\n",
                        libcfs_id2str(peer->peer_id), peer->peer_credits,
                        peer->peer_outstanding_credits, peer->peer_sent_credits,
                 CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: %s tx=%p nob=%d cred=%d\n",
                        libcfs_id2str(peer->peer_id), peer->peer_credits,
                        peer->peer_outstanding_credits, peer->peer_sent_credits,
-                       kptllnd_msgtype2str(tx->tx_msg->ptlm_type),
-                       tx, tx->tx_msg->ptlm_nob,
+                       kptllnd_msgtype2str(msg_type), tx, tx->tx_msg->ptlm_nob,
                        tx->tx_msg->ptlm_credits);
 
                 list_add_tail(&tx->tx_list, &peer->peer_activeq);
                        tx->tx_msg->ptlm_credits);
 
                 list_add_tail(&tx->tx_list, &peer->peer_activeq);
index 847e265..356660c 100644 (file)
@@ -331,12 +331,15 @@ kptllnd_rx_alloc(void)
 }
 
 void
 }
 
 void
-kptllnd_rx_done(kptl_rx_t *rx)
+kptllnd_rx_done(kptl_rx_t *rx, int post_credit)
 {
         kptl_rx_buffer_t *rxb = rx->rx_rxb;
         kptl_peer_t      *peer = rx->rx_peer;
         unsigned long     flags;
 
 {
         kptl_rx_buffer_t *rxb = rx->rx_rxb;
         kptl_peer_t      *peer = rx->rx_peer;
         unsigned long     flags;
 
+        LASSERT (post_credit == PTLLND_POSTRX_NO_CREDIT ||
+                 post_credit == PTLLND_POSTRX_PEER_CREDIT);
+
         CDEBUG(D_NET, "rx=%p rxb %p peer %p\n", rx, rxb, peer);
 
         if (rxb != NULL)
         CDEBUG(D_NET, "rx=%p rxb %p peer %p\n", rx, rxb, peer);
 
         if (rxb != NULL)
@@ -346,7 +349,9 @@ kptllnd_rx_done(kptl_rx_t *rx)
                 /* Update credits (after I've decref-ed the buffer) */
                 spin_lock_irqsave(&peer->peer_lock, flags);
 
                 /* Update credits (after I've decref-ed the buffer) */
                 spin_lock_irqsave(&peer->peer_lock, flags);
 
-                peer->peer_outstanding_credits++;
+                if (post_credit == PTLLND_POSTRX_PEER_CREDIT)
+                        peer->peer_outstanding_credits++;
+
                 LASSERT (peer->peer_outstanding_credits +
                          peer->peer_sent_credits <=
                          *kptllnd_tunables.kptl_peercredits);
                 LASSERT (peer->peer_outstanding_credits +
                          peer->peer_sent_credits <=
                          *kptllnd_tunables.kptl_peercredits);
@@ -515,6 +520,7 @@ void
 kptllnd_rx_parse(kptl_rx_t *rx)
 {
         kptl_msg_t             *msg = rx->rx_msg;
 kptllnd_rx_parse(kptl_rx_t *rx)
 {
         kptl_msg_t             *msg = rx->rx_msg;
+        int                     post_credit = PTLLND_POSTRX_PEER_CREDIT;
         kptl_peer_t            *peer;
         int                     rc;
         unsigned long           flags;
         kptl_peer_t            *peer;
         int                     rc;
         unsigned long           flags;
@@ -642,7 +648,7 @@ kptllnd_rx_parse(kptl_rx_t *rx)
                 int  c = peer->peer_credits;
                 int oc = peer->peer_outstanding_credits;
                 int sc = peer->peer_sent_credits;
                 int  c = peer->peer_credits;
                 int oc = peer->peer_outstanding_credits;
                 int sc = peer->peer_sent_credits;
-                
+
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
 
                 CERROR("%s: buffer overrun [%d/%d+%d]\n",
                 spin_unlock_irqrestore(&peer->peer_lock, flags);
 
                 CERROR("%s: buffer overrun [%d/%d+%d]\n",
@@ -655,6 +661,12 @@ kptllnd_rx_parse(kptl_rx_t *rx)
          * buffers after the startup handshake. */
         peer->peer_credits += msg->ptlm_credits;
 
          * buffers after the startup handshake. */
         peer->peer_credits += msg->ptlm_credits;
 
+        /* This ensures the credit taken by NOOP can be returned */
+        if (msg->ptlm_type == PTLLND_MSG_TYPE_NOOP) {
+                peer->peer_outstanding_credits++;
+                post_credit = PTLLND_POSTRX_NO_CREDIT;
+        }
+
         spin_unlock_irqrestore(&peer->peer_lock, flags);
 
         /* See if something can go out now that credits have come in */
         spin_unlock_irqrestore(&peer->peer_lock, flags);
 
         /* See if something can go out now that credits have come in */
@@ -723,5 +735,5 @@ kptllnd_rx_parse(kptl_rx_t *rx)
         if (rx->rx_peer == NULL)                /* drop ref on peer */
                 kptllnd_peer_decref(peer);      /* unless rx_done will */
  rx_done:
         if (rx->rx_peer == NULL)                /* drop ref on peer */
                 kptllnd_peer_decref(peer);      /* unless rx_done will */
  rx_done:
-        kptllnd_rx_done(rx);
+        kptllnd_rx_done(rx, post_credit);
 }
 }