Whamcloud - gitweb
LU-1799 o2iblnd: debug patch for o2iblnd
author Liang Zhen <liang@whamcloud.com>
Wed, 29 Aug 2012 13:24:26 +0000 (21:24 +0800)
committer Oleg Drokin <green@whamcloud.com>
Thu, 13 Sep 2012 13:27:29 +0000 (09:27 -0400)
IBM reported a kernel panic on their BGQ IO node when loading the
ptlrpc module with an o2ib network. The IB interface had both an IPv4
and an IPv6 address. Removing the IPv6 address avoided the crash.

I suspect rdma_bind_addr can't associate any RDMA device in this
case. This patch checks whether an IB device is attached to the cmid
even when the returned value is zero, and it also outputs more
information on failure.

Test-Parameters: nettypes=o2ib
Signed-off-by: Liang Zhen <liang@whamcloud.com>
Change-Id: Id44110fcf56b199b1504ab4e6b0157d87bc2d270
Reviewed-on: http://review.whamcloud.com/3815
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd.c

index a3cd78a..35a6132 100644 (file)
@@ -2544,14 +2544,14 @@ kiblnd_dev_need_failover(kib_dev_t *dev)
         dstaddr.sin_family = AF_INET;
         rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
                                (struct sockaddr *)&dstaddr, 1);
-        if (rc != 0) {
-                CERROR("Failed to bind %s to device: %d\n",
-                       dev->ibd_ifname, rc);
+       if (rc != 0 || cmid->device == NULL) {
+               CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+                      dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+                      cmid->device, rc);
                 rdma_destroy_id(cmid);
                 return rc;
         }
 
-        LASSERT (cmid->device != NULL);
         if (dev->ibd_hdev->ibh_ibdev == cmid->device) {
                 /* don't need device failover */
                 rdma_destroy_id(cmid);
@@ -2617,9 +2617,10 @@ kiblnd_dev_failover(kib_dev_t *dev)
 
         /* Bind to failover device or port */
         rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
-        if (rc != 0) {
-                CERROR("Failed to bind %s to device: %d\n",
-                       dev->ibd_ifname, rc);
+       if (rc != 0 || cmid->device == NULL) {
+               CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+                      dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+                      cmid->device, rc);
                 rdma_destroy_id(cmid);
                 goto out;
         }