From e368c609fac2e7442f71a8456d062cf2e86a1ae2 Mon Sep 17 00:00:00 2001 From: eeb Date: Wed, 2 Mar 2005 19:10:12 +0000 Subject: [PATCH] * Fixed 5708: openib reconnect on client reboot issues --- lnet/klnds/openiblnd/openiblnd.c | 5 ++++- lnet/klnds/openiblnd/openiblnd_cb.c | 42 +++++++++++++++++++++++++------------ 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/lnet/klnds/openiblnd/openiblnd.c b/lnet/klnds/openiblnd/openiblnd.c index 9e6ca58..480c5aa 100644 --- a/lnet/klnds/openiblnd/openiblnd.c +++ b/lnet/klnds/openiblnd/openiblnd.c @@ -1001,6 +1001,7 @@ kibnal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ INIT_LIST_HEAD (&peer->ibp_conns); INIT_LIST_HEAD (&peer->ibp_tx_queue); + INIT_LIST_HEAD (&peer->ibp_connd_list); /* not queued for connecting */ peer->ibp_reconnect_time = jiffies; peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; @@ -1020,6 +1021,7 @@ kibnal_destroy_peer (kib_peer_t *peer) LASSERT (peer->ibp_persistence == 0); LASSERT (!kibnal_peer_active(peer)); LASSERT (peer->ibp_connecting == 0); + LASSERT (list_empty (&peer->ibp_connd_list)); LASSERT (list_empty (&peer->ibp_conns)); LASSERT (list_empty (&peer->ibp_tx_queue)); @@ -1510,7 +1512,8 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) if (conn->ibc_incarnation == incarnation) continue; - CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", + CDEBUG(D_NET, "Closing stale conn %p nid:"LPX64 + " incarnation:"LPX64"("LPX64")\n", conn, peer->ibp_nid, conn->ibc_incarnation, incarnation); count++; diff --git a/lnet/klnds/openiblnd/openiblnd_cb.c b/lnet/klnds/openiblnd/openiblnd_cb.c index 9c7116e..3238b0e 100644 --- a/lnet/klnds/openiblnd/openiblnd_cb.c +++ b/lnet/klnds/openiblnd/openiblnd_cb.c @@ -916,6 +916,24 @@ kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) } void +kibnal_schedule_active_connect_locked (kib_peer_t *peer) +{ + /* Called with exclusive kib_global_lock: counts a new connect attempt and queues peer for connd */ + + 
peer->ibp_connecting++; + atomic_inc (&peer->ibp_refcount); /* extra ref for connd */ + + spin_lock (&kibnal_data.kib_connd_lock); + + LASSERT (list_empty(&peer->ibp_connd_list)); /* peer must not already be queued */ + list_add_tail (&peer->ibp_connd_list, + &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock (&kibnal_data.kib_connd_lock); +} + +void kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) { unsigned long flags; @@ -984,16 +1002,7 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) return; } - peer->ibp_connecting = 1; - atomic_inc (&peer->ibp_refcount); /* extra ref for connd */ - - spin_lock (&kibnal_data.kib_connd_lock); - - list_add_tail (&peer->ibp_connd_list, - &kibnal_data.kib_connd_peers); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock (&kibnal_data.kib_connd_lock); + kibnal_schedule_active_connect_locked(peer); } /* A connection is being established; queue the message... */ @@ -1527,7 +1536,7 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) peer->ibp_connecting--; if (peer->ibp_connecting != 0) { - /* another connection attempt under way (loopback?)... */ + /* another connection attempt under way... 
*/ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return; } @@ -1977,7 +1986,8 @@ kibnal_active_conn_callback (tTS_IB_CM_EVENT event, void *param, void *arg) { - kib_conn_t *conn = arg; + kib_conn_t *conn = arg; + unsigned long flags; switch (event) { case TS_IB_CM_REP_RECEIVED: { @@ -2036,7 +2046,13 @@ kibnal_active_conn_callback (tTS_IB_CM_EVENT event, case TS_IB_CM_IDLE: CERROR("Connection %p -> "LPX64" IDLE\n", conn, conn->ibc_peer->ibp_nid); - /* Back out state change: I'm disengaged from CM */ + /* I assume this connection attempt was rejected because the + * peer found a stale QP; I'll just try again. NOTE(review): no retry bound is visible in this hunk -- confirm */ + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + kibnal_schedule_active_connect_locked(conn->ibc_peer); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + /* Back out state change: this conn disengaged from CM */ conn->ibc_state = IBNAL_CONN_INIT_QP; kibnal_connreq_done (conn, 1, -ECONNABORTED); -- 1.8.3.1