Whamcloud - gitweb
b=22065 ko2iblnd failover deadlock fix
author: Johann Lombardi <johann@sun.com>
Mon, 29 Mar 2010 21:26:38 +0000 (23:26 +0200)
committer: Johann Lombardi <johann@sun.com>
Mon, 29 Mar 2010 21:26:38 +0000 (23:26 +0200)
i=maxim
i=liang

lnet/klnds/o2iblnd/o2iblnd.c
lustre/ChangeLog

index 826449d..b322087 100644 (file)
@@ -742,14 +742,10 @@ kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
         memset(conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars));
 
         write_lock_irqsave(glock, flags);
-        i = 0;
-        while (dev->ibd_failover) {
+        if (dev->ibd_failover) {
                 write_unlock_irqrestore(glock, flags);
-                /* shouldn't take long time */
-                if (i++ % 50 == 0)
-                        CDEBUG(D_NET, "Wait for dev(%s) failover\n", dev->ibd_ifname);
-                cfs_schedule_timeout(CFS_TASK_UNINT, cfs_time_seconds(1) / 50);
-                write_lock_irqsave(glock, flags);
+                CERROR("%s: failover in progress\n", dev->ibd_ifname);
+                goto failed_2;
         }
 
         if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
@@ -2362,7 +2358,7 @@ kiblnd_dev_need_failover(kib_dev_t *dev)
         struct sockaddr_in  dstaddr;
         int                 rc;
 
-        if (dev->ibd_hdev == NULL || /* intializing */
+        if (dev->ibd_hdev == NULL || /* initializing */
             dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */
             *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
                 return 1;
@@ -2599,7 +2595,7 @@ kiblnd_create_dev(char *ifname)
         dev->ibd_ifip = ip;
         strcpy(&dev->ibd_ifname[0], ifname);
 
-        /* intialize the device */
+        /* initialize the device */
         rc = kiblnd_dev_failover(dev);
         if (rc != 0) {
                 CERROR("Can't initialize device: %d\n", rc);
index 10996f9..d92f546 100644 (file)
@@ -49,6 +49,11 @@ Bugzilla   : 20400
 Description: Downgrade RHEL5 kernel to 2.6.18-128.1.14.el5
 Details    : BoM needs 2.6.18-128.1.14.el5, so revert the patch from bug 20400
 
+Severity   : normal
+Bugzilla   : 22065
+Description: LNET hang
+Details    : fix deadlock with ko2iblnd failover
+
 -------------------------------------------------------------------------------
 
 2009-10-16 Sun Microsystems, Inc.