From 7e17d3c67e5ead912af69be01a5ee81522baa551 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Mon, 29 Mar 2010 23:26:38 +0200 Subject: [PATCH] b=22065 ko2iblnd failover deadlock fix i=maxim i=liang --- lnet/klnds/o2iblnd/o2iblnd.c | 14 +++++--------- lustre/ChangeLog | 5 +++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 826449d..b322087 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -742,14 +742,10 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, memset(conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars)); write_lock_irqsave(glock, flags); - i = 0; - while (dev->ibd_failover) { + if (dev->ibd_failover) { write_unlock_irqrestore(glock, flags); - /* shouldn't take long time */ - if (i++ % 50 == 0) - CDEBUG(D_NET, "Wait for dev(%s) failover\n", dev->ibd_ifname); - cfs_schedule_timeout(CFS_TASK_UNINT, cfs_time_seconds(1) / 50); - write_lock_irqsave(glock, flags); + CERROR("%s: failover in progress\n", dev->ibd_ifname); + goto failed_2; } if (dev->ibd_hdev->ibh_ibdev != cmid->device) { @@ -2362,7 +2358,7 @@ kiblnd_dev_need_failover(kib_dev_t *dev) struct sockaddr_in dstaddr; int rc; - if (dev->ibd_hdev == NULL || /* intializing */ + if (dev->ibd_hdev == NULL || /* initializing */ dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */ *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ return 1; @@ -2599,7 +2595,7 @@ kiblnd_create_dev(char *ifname) dev->ibd_ifip = ip; strcpy(&dev->ibd_ifname[0], ifname); - /* intialize the device */ + /* initialize the device */ rc = kiblnd_dev_failover(dev); if (rc != 0) { CERROR("Can't initialize device: %d\n", rc); diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 10996f9..d92f546 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -49,6 +49,11 @@ Bugzilla : 20400 Description: Downgrade RHEL5 kernel to 2.6.18-128.1.14.el5 Details : BoM needs 2.6.18-128.1.14.el5, so revert the patch from bug 20400 +Severity : normal +Bugzilla : 22065 +Description: LNET hang +Details : fix deadlock with ko2iblnd failover + ------------------------------------------------------------------------------- 2009-10-16 Sun Microsystems, Inc. -- 1.8.3.1