From 46abd986a9829ce37845002fa6667233cca399f6 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Thu, 17 Dec 2009 00:07:41 +0800 Subject: [PATCH] b=21396 LNET soft lockups in socknal_connd thread don't hog CPU for active-connecting if another connd is accepting connecting-request from the same peer i=isaac i=maxim --- lnet/klnds/socklnd/socklnd.c | 8 ++++---- lnet/klnds/socklnd/socklnd_cb.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index e5f3402..573e790 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1804,7 +1804,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) rwlock_t *glock = &ksocknal_data.ksnd_global_lock; lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; - read_lock(glock); + cfs_read_lock(glock); peer = ksocknal_find_peer_locked(ni, id); if (peer != NULL) { @@ -1830,7 +1830,7 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) connect = 0; } - read_unlock(glock); + cfs_read_unlock(glock); if (last_alive != 0) *when = last_alive; @@ -1840,13 +1840,13 @@ ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); - write_lock_bh(glock); + cfs_write_lock_bh(glock); peer = ksocknal_find_peer_locked(ni, id); if (peer != NULL) ksocknal_launch_all_connections_locked(peer); - write_unlock_bh(glock); + cfs_write_unlock_bh(glock); return; } diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 89041f8..e718351 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -1861,7 +1861,7 @@ ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, return 0; } -void +int ksocknal_connect (ksock_route_t *route) { CFS_LIST_HEAD (zombies); @@ -1957,7 +1957,8 @@ ksocknal_connect (ksock_route_t *route) /* re-queue for attention; this frees me up to 
handle * the peer's incoming connection request */ - if (rc == EALREADY) { + if (rc == EALREADY || + (rc == 0 && peer->ksnp_accepting > 0)) { /* We want to introduce a delay before next * attempt to connect if we lost conn race, * but the race is resolved quickly usually, @@ -1972,7 +1973,7 @@ ksocknal_connect (ksock_route_t *route) } cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock); - return; + return retry_later; failed: cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock); @@ -2021,6 +2022,7 @@ ksocknal_connect (ksock_route_t *route) ksocknal_peer_failed(peer); ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1); + return 0; } /* Go through connd_routes queue looking for a route that @@ -2064,6 +2066,8 @@ ksocknal_connd (void *arg) ksock_route_t *route; cfs_waitlink_t wait; signed long timeout; + int nloops = 0; + int cons_retry = 0; int dropped_lock; snprintf (name, sizeof (name), "socknal_cd%02ld", id); @@ -2107,21 +2111,39 @@ ksocknal_connd (void *arg) cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock); dropped_lock = 1; - ksocknal_connect (route); + if (ksocknal_connect(route)) { + /* consecutive retry */ + if (cons_retry++ > 10000) { + CWARN("massive consecutive " + "retry-connecting\n"); + cons_retry = 0; + } + } else { + cons_retry = 0; + } + ksocknal_route_decref(route); cfs_spin_lock_bh (&ksocknal_data.ksnd_connd_lock); ksocknal_data.ksnd_connd_connecting--; } - if (dropped_lock) + if (dropped_lock) { + if (++nloops < SOCKNAL_RESCHED) + continue; + cfs_spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + nloops = 0; + cfs_cond_resched(); + cfs_spin_lock_bh(&ksocknal_data.ksnd_connd_lock); continue; + } /* Nothing to do for 'timeout' */ cfs_set_current_state (CFS_TASK_INTERRUPTIBLE); cfs_waitq_add_exclusive (&ksocknal_data.ksnd_connd_waitq, &wait); cfs_spin_unlock_bh (&ksocknal_data.ksnd_connd_lock); + nloops = 0; cfs_waitq_timedwait (&wait, CFS_TASK_INTERRUPTIBLE, timeout); cfs_set_current_state (CFS_TASK_RUNNING); -- 1.8.3.1