LU-6128 lnet: handle lnet_check_routes() errors

[fs/lustre-release.git] / lnet / klnds / gnilnd / gnilnd.c
diff --git a/lnet/klnds/gnilnd/gnilnd.c b/lnet/klnds/gnilnd/gnilnd.c

index fcc05fa..0ed8778 100644 (file)
--- a/lnet/klnds/gnilnd/gnilnd.c
+++ b/lnet/klnds/gnilnd/gnilnd.c
@@ -1,7 +1,8 @@
  /*
   * Copyright (C) 2012 Cray, Inc.
   *
- *   Author: Igor Gorodetsky <iogordet@cray.com>
+ * Copyright (c) 2013, 2014, Intel Corporation.
+ *
   *   Author: Nic Henke <nic@cray.com>
   *   Author: James Shimek <jshimek@cray.com>
   *
@@ -25,7 +26,11 @@
  
  /* Primary entry points from LNET.  There are no guarantees against reentrance. */
  lnd_t the_kgnilnd = {
+#ifdef CONFIG_CRAY_XT
         .lnd_type       = GNILND,
+#else
+       .lnd_type       = GNIIPLND,
+#endif
         .lnd_startup    = kgnilnd_startup,
         .lnd_shutdown   = kgnilnd_shutdown,
         .lnd_ctl        = kgnilnd_ctl,
@@ -36,7 +41,6 @@ lnd_t the_kgnilnd = {
  };
  
  kgn_data_t      kgnilnd_data;
-kgn_hssops_t   kgnilnd_hssops;
  
  /* needs write_lock on kgn_peer_conn_lock */
  int
@@ -177,9 +181,9 @@ kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn)
  int
  kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev)
  {
-       kgn_conn_t    *conn;
-       gni_return_t   rrc;
-       int            rc = 0;
+       kgn_conn_t      *conn;
+       gni_return_t    rrc;
+       int             rc = 0;
  
         LASSERT (!in_interrupt());
         atomic_inc(&kgnilnd_data.kgn_nconns);
@@ -201,13 +205,13 @@ kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev)
         LIBCFS_ALLOC(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *));
         if (conn->gnc_tx_ref_table == NULL) {
                 CERROR("Can't allocate conn tx_ref_table\n");
-               rc = -ENOMEM;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENOMEM);
         }
  
         atomic_set(&conn->gnc_refcount, 1);
         atomic_set(&conn->gnc_reaper_noop, 0);
         atomic_set(&conn->gnc_sched_noop, 0);
+       atomic_set(&conn->gnc_tx_in_use, 0);
         INIT_LIST_HEAD(&conn->gnc_list);
         INIT_LIST_HEAD(&conn->gnc_hashlist);
         INIT_LIST_HEAD(&conn->gnc_schedlist);
@@ -215,6 +219,7 @@ kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev)
         INIT_LIST_HEAD(&conn->gnc_mdd_list);
         spin_lock_init(&conn->gnc_list_lock);
         spin_lock_init(&conn->gnc_tx_lock);
+       conn->gnc_magic = GNILND_CONN_MAGIC;
  
         /* set tx id to nearly the end to make sure we find wrapping
          * issues soon */
@@ -231,8 +236,7 @@ kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev)
  
         if (conn->gnc_cqid == 0) {
                 CERROR("Could not allocate unique CQ ID for conn 0x%p\n", conn);
-               rc = -E2BIG;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -E2BIG);
         }
  
         CDEBUG(D_NET, "alloc cqid %u for conn 0x%p\n",
@@ -251,10 +255,8 @@ kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev)
         rrc = kgnilnd_ep_create(dev->gnd_handle, dev->gnd_snd_fma_cqh,
                                 &conn->gnc_ephandle);
         mutex_unlock(&dev->gnd_cq_mutex);
-       if (rrc != GNI_RC_SUCCESS) {
-               rc = -ENETDOWN;
-               GOTO(failed, rc);
-       }
+       if (rrc != GNI_RC_SUCCESS)
+               GOTO(failed, rc = -ENETDOWN);
  
         CDEBUG(D_NET, "created conn 0x%p ep_hndl 0x%p\n",
                conn, conn->gnc_ephandle);
@@ -278,7 +280,6 @@ kgn_conn_t *
  kgnilnd_find_conn_locked(kgn_peer_t *peer)
  {
         kgn_conn_t      *conn = NULL;
-       ENTRY;
  
         /* if we are in reset, this conn is going to die soon */
         if (unlikely(kgnilnd_data.kgn_in_reset)) {
@@ -399,13 +400,15 @@ kgnilnd_destroy_conn(kgn_conn_t *conn)
                 list_empty(&conn->gnc_list) &&
                 list_empty(&conn->gnc_hashlist) &&
                 list_empty(&conn->gnc_schedlist) &&
-               list_empty(&conn->gnc_mdd_list),
-               "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p lists %d/%d/%d/%d\n",
+               list_empty(&conn->gnc_mdd_list) &&
+               conn->gnc_magic == GNILND_CONN_MAGIC,
+               "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p Mg %d lists %d/%d/%d/%d\n",
                 conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid)
                                      : "<?>",
                 !!in_interrupt(), conn->gnc_scheduled,
                 conn->gnc_in_purgatory,
                 conn->gnc_ephandle,
+               conn->gnc_magic,
                 list_empty(&conn->gnc_list),
                 list_empty(&conn->gnc_hashlist),
                 list_empty(&conn->gnc_schedlist),
@@ -424,8 +427,16 @@ kgnilnd_destroy_conn(kgn_conn_t *conn)
         CDEBUG(D_NET, "destroying conn %p ephandle %p error %d\n",
                 conn, conn->gnc_ephandle, conn->gnc_error);
  
+       /* We are freeing this memory remove the magic value from the connection */
+       conn->gnc_magic = 0;
+
         /* if there is an FMA blk left here, we'll tear it down */
         if (conn->gnc_fma_blk) {
+               if (conn->gnc_peer) {
+                       kgn_mbox_info_t *mbox;
+                       mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
+                       mbox->mbx_prev_nid = conn->gnc_peer->gnp_nid;
+               }
                 kgnilnd_release_mbox(conn, 0);
         }
  
@@ -574,7 +585,8 @@ kgnilnd_close_conn_locked(kgn_conn_t *conn, int error)
         }
  
         /* if we NETERROR, make sure it is rate limited */
-       if (!kgnilnd_conn_clean_errno(error)) {
+       if (!kgnilnd_conn_clean_errno(error) &&
+           peer->gnp_down == GNILND_RCA_NODE_UP) {
                 CNETERR("closing conn to %s: error %d\n",
                        libcfs_nid2str(peer->gnp_nid), error);
         } else {
@@ -600,6 +612,7 @@ kgnilnd_close_conn_locked(kgn_conn_t *conn, int error)
         /* Remove from conn hash table: no new callbacks */
         list_del_init(&conn->gnc_hashlist);
         kgnilnd_data.kgn_conn_version++;
+       kgnilnd_conn_decref(conn);
  
         /* if we are in reset, go right to CLOSED as there is no scheduler
          * thread to move from CLOSING to CLOSED */
@@ -628,11 +641,6 @@ kgnilnd_close_conn_locked(kgn_conn_t *conn, int error)
          * gnd_ready_conns and allows us to find it in quiesce processing */
         kgnilnd_schedule_conn(conn);
  
-       /* lose peer's ref */
-       kgnilnd_conn_decref(conn);
-       /* -1 for conn table */
-       kgnilnd_conn_decref(conn);
-
         EXIT;
  }
  
@@ -678,6 +686,17 @@ kgnilnd_complete_closed_conn(kgn_conn_t *conn)
         LASSERT(list_empty(&conn->gnc_hashlist));
  
         /* we've sent the close, start nuking */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SCHEDULE_COMPLETE))
+               kgnilnd_schedule_conn(conn);
+
+       if (conn->gnc_scheduled != GNILND_CONN_PROCESS) {
+               CDEBUG(D_NETERROR, "Error someone scheduled us after we were "
+                               "done, Attempting to recover conn 0x%p "
+                               "scheduled %d function: %s line: %d\n", conn,
+                               conn->gnc_scheduled, conn->gnc_sched_caller,
+                               conn->gnc_sched_line);
+               RETURN_EXIT;
+       }
  
         /* we don't use lists to track things that we can get out of the
          * tx_ref table... */
@@ -713,9 +732,13 @@ kgnilnd_complete_closed_conn(kgn_conn_t *conn)
  
         /* nobody should have marked this as needing scheduling after
          * we called close - so only ref should be us handling it */
-       LASSERTF(conn->gnc_scheduled == GNILND_CONN_PROCESS,
-                "conn 0x%p scheduled %d\n", conn, conn->gnc_scheduled);
-
+       if (conn->gnc_scheduled != GNILND_CONN_PROCESS) {
+               CDEBUG(D_NETERROR, "Error someone scheduled us after we were "
+                               "done, Attempting to recover conn 0x%p "
+                               "scheduled %d function %s line: %d\n", conn,
+                               conn->gnc_scheduled, conn->gnc_sched_caller,
+                               conn->gnc_sched_line);
+       }
         /* now reset a few to actual counters... */
         nrdma = atomic_read(&conn->gnc_nlive_rdma);
         nq_rdma = atomic_read(&conn->gnc_nq_rdma);
@@ -732,17 +755,17 @@ kgnilnd_complete_closed_conn(kgn_conn_t *conn)
         logmsg = (nlive + nrdma + nq_rdma);
  
         if (logmsg) {
-               if (conn->gnc_peer_error != 0) {
+               if (conn->gnc_peer->gnp_down == GNILND_RCA_NODE_UP) {
                         CNETERR("Closed conn 0x%p->%s (errno %d, peer errno %d): "
                                 "canceled %d TX, %d/%d RDMA\n",
                                 conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
                                 conn->gnc_error, conn->gnc_peer_error,
                                 nlive, nq_rdma, nrdma);
                 } else {
-                       CNETERR("Closed conn 0x%p->%s (errno %d): "
-                               "canceled %d TX, %d/%d RDMA\n",
+                       CDEBUG(D_NET, "Closed conn 0x%p->%s (errno %d,"
+                               " peer errno %d): canceled %d TX, %d/%d RDMA\n",
                                 conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
-                               conn->gnc_error,
+                               conn->gnc_error, conn->gnc_peer_error,
                                 nlive, nq_rdma, nrdma);
                 }
         }
@@ -767,6 +790,8 @@ kgnilnd_complete_closed_conn(kgn_conn_t *conn)
         /* Remove from peer's list of valid connections if its not in purgatory */
         if (!conn->gnc_in_purgatory) {
                 list_del_init(&conn->gnc_list);
+               /* Lose peers reference on the conn */
+               kgnilnd_conn_decref(conn);
         }
  
         /* NB - only unlinking if we set pending in del_peer_locked from admin or
@@ -795,6 +820,7 @@ kgnilnd_set_conn_params(kgn_dgram_t *dgram)
         kgn_gniparams_t        *rem_param = &connreq->gncr_gnparams;
         gni_return_t            rrc;
         int                     rc = 0;
+       gni_smsg_attr_t        *remote = &connreq->gncr_gnparams.gnpr_smsg_attr;
  
         /* set timeout vals in conn early so we can use them for the NAK */
  
@@ -829,7 +855,6 @@ kgnilnd_set_conn_params(kgn_dgram_t *dgram)
                         &connreq->gncr_gnparams.gnpr_smsg_attr);
         if (unlikely(rrc == GNI_RC_INVALID_PARAM)) {
                 gni_smsg_attr_t *local = &conn->gnpr_smsg_attr;
-               gni_smsg_attr_t *remote = &connreq->gncr_gnparams.gnpr_smsg_attr;
                 /* help folks figure out if there is a tunable off, etc. */
                 LCONSOLE_ERROR("SMSG attribute mismatch. Data from local/remote:"
                                " type %d/%d msg_maxsize %u/%u"
@@ -864,6 +889,7 @@ kgnilnd_set_conn_params(kgn_dgram_t *dgram)
  
         conn->gnc_peerstamp = connreq->gncr_peerstamp;
         conn->gnc_peer_connstamp = connreq->gncr_connstamp;
+       conn->remote_mbox_addr = (void *)((char *)remote->msg_buffer + remote->mbox_offset);
  
         /* We update the reaper timeout once we have a valid conn and timeout */
         kgnilnd_update_reaper_timeout(GNILND_TO2KA(conn->gnc_timeout));
@@ -890,10 +916,13 @@ return_out:
   * kgn_peer_conn_lock is held, we guarantee that nobody calls
   * kgnilnd_add_peer_locked without checking gnn_shutdown */
  int
-kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net)
+kgnilnd_create_peer_safe(kgn_peer_t **peerp,
+                        lnet_nid_t nid,
+                        kgn_net_t *net,
+                        int node_state)
  {
-       kgn_peer_t    *peer;
-       int            rc;
+       kgn_peer_t      *peer;
+       int             rc;
  
         LASSERT(nid != LNET_NID_ANY);
  
@@ -922,6 +951,7 @@ kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net)
                 return -ENOMEM;
         }
         peer->gnp_nid = nid;
+       peer->gnp_down = node_state;
  
         /* translate from nid to nic addr & store */
         rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(nid), 1, &peer->gnp_host_id);
@@ -1028,13 +1058,12 @@ kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer)
         CDEBUG(D_NET, "conn %p peer %p dev %p\n", conn, peer,
                 conn->gnc_device);
  
-       /* add ref for mbox purgatory hold */
-       kgnilnd_peer_addref(peer);
-       kgnilnd_conn_addref(conn);
+       LASSERTF(conn->gnc_in_purgatory == 0,
+               "Conn already in purgatory\n");
         conn->gnc_in_purgatory = 1;
  
         mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
-       mbox->mbx_prev_nid = peer->gnp_nid;
+       mbox->mbx_prev_purg_nid = peer->gnp_nid;
         mbox->mbx_add_purgatory = jiffies;
         kgnilnd_release_mbox(conn, 1);
  
@@ -1085,7 +1114,6 @@ kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list)
                  * on the peer's conn_list anymore.
                  */
  
-               kgnilnd_peer_decref(conn->gnc_peer);
                 list_del_init(&conn->gnc_list);
  
                 /* NB - only unlinking if we set pending in del_peer_locked from admin or
@@ -1253,9 +1281,6 @@ kgnilnd_get_peer_info(int index,
                 list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) {
                         peer = list_entry(ptmp, kgn_peer_t, gnp_list);
  
-                       if (peer->gnp_nid != *id)
-                               continue;
-
                         if (index-- > 0)
                                 continue;
  
@@ -1316,11 +1341,14 @@ kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp)
  {
         kgn_peer_t        *peer;
         int                rc;
+       int                node_state;
         ENTRY;
  
         if (nid == LNET_NID_ANY)
                 return -EINVAL;
  
+       node_state = kgnilnd_get_node_state(LNET_NIDADDR(nid));
+
         /* NB - this will not block during normal operations -
          * the only writer of this is in the startup/shutdown path. */
         rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
@@ -1328,7 +1356,7 @@ kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp)
                 rc = -ESHUTDOWN;
                 RETURN(rc);
         }
-       rc = kgnilnd_create_peer_safe(&peer, nid, net);
+       rc = kgnilnd_create_peer_safe(&peer, nid, net, node_state);
         if (rc != 0) {
                 up_read(&kgnilnd_data.kgn_net_rw_sem);
                 RETURN(rc);
@@ -1495,9 +1523,6 @@ kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command,
  
         write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
  
-       /* release all of the souls found held in purgatory */
-       kgnilnd_release_purgatory_list(&souls);
-
         /* nuke peer TX */
         kgnilnd_txlist_done(&zombies, error);
  
@@ -1628,6 +1653,103 @@ kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why)
  }
  
  int
+kgnilnd_report_node_state(lnet_nid_t nid, int down)
+{      /* Apply a node up/down event for @nid to the matching peer entry.
+        *
+        * \param nid   node id delivered with the event; when no peer is
+        *              found it is combined with a net number below before
+        *              a peer is created for it
+        * \param down  node state; compared against GNILND_RCA_NODE_UP and
+        *              GNILND_RCA_NODE_DOWN and stored in peer->gnp_down
+        *
+        * \retval 0    state recorded, or an up event for an unknown peer
+        *              was ignored
+        * \retval 1    no net was found, or the peer could not be added or
+        *              could not be found again after being added
+        */
+       int         rc;
+       kgn_peer_t  *peer, *new_peer;   /* new_peer: out-param of kgnilnd_add_peer() only */
+       LIST_HEAD(zombies);     /* passed to cancel_peer_connect, then to txlist_done */
+       /* peer lookup and every peer field update below happen under the
+        * write side of kgn_peer_conn_lock */
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       peer = kgnilnd_find_peer_locked(nid);
+       /* unknown peer: ignore an up event, otherwise create the peer */
+       if (peer == NULL) {
+               int       i;
+               int       found_net = 0;
+               kgn_net_t *net;
+               /* drop the lock here; it is re-taken after kgnilnd_add_peer() */
+               write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+               /* Don't add a peer for node up events */
+               if (down == GNILND_RCA_NODE_UP) {
+                       return 0;
+               }
+
+               /* find any valid net - we don't care which one... */
+               down_read(&kgnilnd_data.kgn_net_rw_sem);
+               for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+                       list_for_each_entry(net, &kgnilnd_data.kgn_nets[i],
+                                           gnn_list) {
+                               found_net = 1;
+                               break;
+                       }
+                       /* the break above only leaves the list walk, so the
+                        * bucket loop needs its own exit */
+                       if (found_net) {
+                               break;
+                       }
+               }
+               up_read(&kgnilnd_data.kgn_net_rw_sem);
+               /* NOTE(review): net is still dereferenced below after
+                * kgn_net_rw_sem is released and no reference is taken in
+                * this function - confirm the net cannot go away here */
+               if (!found_net) {
+                       CNETERR("Could not find a net for nid %lld\n", nid);
+                       return 1;
+               }
+
+               /* The nid passed in does not yet contain the net portion.
+                * Let's build it up now
+                */
+               nid = LNET_MKNID(LNET_NIDNET(net->gnn_ni->ni_nid), nid);
+               rc = kgnilnd_add_peer(net, nid, &new_peer);
+
+               if (rc) {
+                       CNETERR("Could not add peer for nid %lld, rc %d\n",
+                               nid, rc);
+                       return 1;
+               }
+               /* new_peer is not used; the peer is looked up again by the
+                * rebuilt nid once the lock is held */
+               write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+               peer = kgnilnd_find_peer_locked(nid);
+
+               if (peer == NULL) {
+                       CNETERR("Could not find peer for nid %lld\n", nid);
+                       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       return 1;
+               }
+       }
+       /* from here peer is non-NULL and kgn_peer_conn_lock is write-held */
+       peer->gnp_down = down;
+
+       if (down == GNILND_RCA_NODE_DOWN) {
+               kgn_conn_t *conn;
+               /* stamp the event, cancel any pending connect (collected on
+                * zombies) and close the conn found for this peer, if any */
+               peer->gnp_down_event_time = jiffies;
+               kgnilnd_cancel_peer_connect_locked(peer, &zombies);
+               conn = kgnilnd_find_conn_locked(peer);
+
+               if (conn != NULL) {
+                       kgnilnd_close_conn_locked(conn, -ENETRESET);
+               }
+       } else {        /* any other state only stamps the up-event time */
+               peer->gnp_up_event_time = jiffies;
+       }
+
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       /* NOTE(review): peer is passed to kgnilnd_peer_notify() below after
+        * the lock is dropped and no reference is taken in this function -
+        * confirm the peer's lifetime is guaranteed some other way */
+       if (down == GNILND_RCA_NODE_DOWN) {
+               /* using ENETRESET so we don't get messages from
+                * kgnilnd_tx_done
+                */
+               kgnilnd_txlist_done(&zombies, -ENETRESET);
+
+               if (*kgnilnd_tunables.kgn_peer_health) {
+                       kgnilnd_peer_notify(peer, -ECONNRESET);
+               }
+       }
+       /* nid may have been rebuilt with the net portion above; any
+        * non-zero value of down is printed as "down" */
+       CDEBUG(D_INFO, "marking nid %lld %s\n", nid, down ? "down" : "up");
+       return 0;
+}
+
+int
  kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
  {
         struct libcfs_ioctl_data *data = arg;
@@ -1754,7 +1876,10 @@ kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
         kgn_tx_t                *tx;
         kgn_peer_t              *peer = NULL;
         kgn_conn_t              *conn = NULL;
-       lnet_process_id_t       id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+       lnet_process_id_t       id = {
+               .nid = nid,
+               .pid = LNET_PID_LUSTRE,
+       };
         ENTRY;
  
         /* I expect to find him, so only take a read lock */
@@ -1834,8 +1959,7 @@ kgnilnd_dev_init(kgn_device_t *dev)
                                  &dev->gnd_domain);
         if (rrc != GNI_RC_SUCCESS) {
                 CERROR("Can't create CDM %d (%d)\n", dev->gnd_id, rrc);
-               rc = -ENODEV;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENODEV);
         }
  
         rrc = kgnilnd_cdm_attach(dev->gnd_domain, dev->gnd_id,
@@ -1843,28 +1967,28 @@ kgnilnd_dev_init(kgn_device_t *dev)
         if (rrc != GNI_RC_SUCCESS) {
                 CERROR("Can't attach CDM to device %d (%d)\n",
                         dev->gnd_id, rrc);
-               rc = -ENODEV;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENODEV);
         }
  
+       /* a bit gross, but not much we can do - Aries Sim doesn't have
+        * hardcoded NIC/NID that we can use */
         rc = kgnilnd_setup_nic_translation(dev->gnd_host_id);
-       if (rc != 0) {
-               rc = -ENODEV;
-               GOTO(failed, rc);
-       }
+       if (rc != 0)
+               GOTO(failed, rc = -ENODEV);
  
         /* only dev 0 gets the errors - no need to reset the stack twice
          * - this works because we have a single PTAG, if we had more
          * then we'd need to have multiple handlers */
         if (dev->gnd_id == 0) {
-               rrc = kgnilnd_subscribe_errors(dev->gnd_handle, GNI_ERRMASK_CRITICAL,
+               rrc = kgnilnd_subscribe_errors(dev->gnd_handle,
+                                               GNI_ERRMASK_CRITICAL |
+                                               GNI_ERRMASK_UNKNOWN_TRANSACTION,
                                               0, NULL, kgnilnd_critical_error,
                                               &dev->gnd_err_handle);
                 if (rrc != GNI_RC_SUCCESS) {
                         CERROR("Can't subscribe for errors on device %d: rc %d\n",
                                 dev->gnd_id, rrc);
-                       rc = -ENODEV;
-                       GOTO(failed, rc);
+                       GOTO(failed, rc = -ENODEV);
                 }
  
                 rc = kgnilnd_set_quiesce_callback(dev->gnd_handle,
@@ -1872,11 +1996,16 @@ kgnilnd_dev_init(kgn_device_t *dev)
                 if (rc != GNI_RC_SUCCESS) {
                         CERROR("Can't subscribe for quiesce callback on device %d: rc %d\n",
                                 dev->gnd_id, rrc);
-                       rc = -ENODEV;
-                       GOTO(failed, rc);
+                       GOTO(failed, rc = -ENODEV);
                 }
         }
  
+       rrc = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_IP, &kgnilnd_data.kgn_sock);
+       if (rrc < 0) {
+               CERROR("sock_create returned %d\n", rrc);
+               GOTO(failed, rrc);
+       }
+
         rc = kgnilnd_nicaddr_to_nid(dev->gnd_host_id, &dev->gnd_nid);
         if (rc < 0) {
                 /* log messages during startup */
@@ -1884,8 +2013,7 @@ kgnilnd_dev_init(kgn_device_t *dev)
                         CERROR("couldn't translate host_id 0x%x to nid. rc %d\n",
                                 dev->gnd_host_id, rc);
                 }
-               rc = -ESRCH;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ESRCH);
         }
         CDEBUG(D_NET, "NIC %x -> NID %d\n", dev->gnd_host_id, dev->gnd_nid);
  
@@ -1895,8 +2023,7 @@ kgnilnd_dev_init(kgn_device_t *dev)
         if (rrc != GNI_RC_SUCCESS) {
                 CERROR("Can't create rdma send cq size %u for device "
                        "%d (%d)\n", cq_size, dev->gnd_id, rrc);
-               rc = -EINVAL;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -EINVAL);
         }
  
         rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
@@ -1905,8 +2032,7 @@ kgnilnd_dev_init(kgn_device_t *dev)
         if (rrc != GNI_RC_SUCCESS) {
                 CERROR("Can't create fma send cq size %u for device %d (%d)\n",
                        cq_size, dev->gnd_id, rrc);
-               rc = -EINVAL;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -EINVAL);
         }
  
         /* This one we size differently - overflows are possible and it needs to be
@@ -1918,8 +2044,7 @@ kgnilnd_dev_init(kgn_device_t *dev)
         if (rrc != GNI_RC_SUCCESS) {
                 CERROR("Can't create fma cq size %d for device %d (%d)\n",
                        *kgnilnd_tunables.kgn_fma_cq_size, dev->gnd_id, rrc);
-               rc = -EINVAL;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -EINVAL);
         }
  
         RETURN(0);
@@ -2007,6 +2132,8 @@ kgnilnd_dev_fini(kgn_device_t *dev)
                 dev->gnd_domain = NULL;
         }
  
+       sock_release(kgnilnd_data.kgn_sock);
+
         EXIT;
  }
  
@@ -2026,7 +2153,6 @@ int kgnilnd_base_startup(void)
  
         /* zero pointers, flags etc */
         memset(&kgnilnd_data, 0, sizeof(kgnilnd_data));
-       memset(&kgnilnd_hssops, 0, sizeof(kgnilnd_hssops));
  
         /* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and
          * a unique (for all time) connstamp so we can uniquely identify
@@ -2049,7 +2175,7 @@ int kgnilnd_base_startup(void)
                 INIT_LIST_HEAD(&dev->gnd_map_tx);
                 INIT_LIST_HEAD(&dev->gnd_fma_buffs);
                 mutex_init(&dev->gnd_cq_mutex);
-               sema_init(&dev->gnd_fmablk_sem, 1);
+               mutex_init(&dev->gnd_fmablk_mutex);
                 spin_lock_init(&dev->gnd_fmablk_lock);
                 init_waitqueue_head(&dev->gnd_waitq);
                 init_waitqueue_head(&dev->gnd_dgram_waitq);
@@ -2066,24 +2192,28 @@ int kgnilnd_base_startup(void)
                 spin_lock_init(&dev->gnd_dgram_lock);
                 spin_lock_init(&dev->gnd_rdmaq_lock);
                 INIT_LIST_HEAD(&dev->gnd_rdmaq);
+               init_rwsem(&dev->gnd_conn_sem);
  
                 /* alloc & setup nid based dgram table */
                 LIBCFS_ALLOC(dev->gnd_dgrams,
                             sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
  
-               if (dev->gnd_dgrams == NULL) {
-                       rc = -ENOMEM;
-                       GOTO(failed, rc);
-               }
+               if (dev->gnd_dgrams == NULL)
+                       GOTO(failed, rc = -ENOMEM);
  
                 for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
                         INIT_LIST_HEAD(&dev->gnd_dgrams[i]);
                 }
                 atomic_set(&dev->gnd_ndgrams, 0);
-
+               atomic_set(&dev->gnd_nwcdgrams, 0);
                 /* setup timer for RDMAQ processing */
                 setup_timer(&dev->gnd_rdmaq_timer, kgnilnd_schedule_device_timer,
                             (unsigned long)dev);
+
+               /* setup timer for mapping processing */
+               setup_timer(&dev->gnd_map_timer, kgnilnd_schedule_device_timer,
+                           (unsigned long)dev);
+
         }
  
         /* CQID 0 isn't allowed, set to MAX_MSG_ID - 1 to check for conflicts early */
@@ -2093,24 +2223,26 @@ int kgnilnd_base_startup(void)
         init_waitqueue_head(&kgnilnd_data.kgn_ruhroh_waitq);
         spin_lock_init(&kgnilnd_data.kgn_reaper_lock);
  
-       sema_init(&kgnilnd_data.kgn_quiesce_sem, 1);
+       mutex_init(&kgnilnd_data.kgn_quiesce_mutex);
         atomic_set(&kgnilnd_data.kgn_nquiesce, 0);
         atomic_set(&kgnilnd_data.kgn_npending_conns, 0);
         atomic_set(&kgnilnd_data.kgn_npending_unlink, 0);
         atomic_set(&kgnilnd_data.kgn_npending_detach, 0);
+       atomic_set(&kgnilnd_data.kgn_rev_offset, 0);
+       atomic_set(&kgnilnd_data.kgn_rev_length, 0);
+       atomic_set(&kgnilnd_data.kgn_rev_copy_buff, 0);
+
         /* OK to call kgnilnd_api_shutdown() to cleanup now */
         kgnilnd_data.kgn_init = GNILND_INIT_DATA;
-       PORTAL_MODULE_USE;
+       try_module_get(THIS_MODULE);
  
         rwlock_init(&kgnilnd_data.kgn_peer_conn_lock);
  
         LIBCFS_ALLOC(kgnilnd_data.kgn_peers,
                     sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
  
-       if (kgnilnd_data.kgn_peers == NULL) {
-               rc = -ENOMEM;
-               GOTO(failed, rc);
-       }
+       if (kgnilnd_data.kgn_peers == NULL)
+               GOTO(failed, rc = -ENOMEM);
  
         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
                 INIT_LIST_HEAD(&kgnilnd_data.kgn_peers[i]);
@@ -2119,10 +2251,8 @@ int kgnilnd_base_startup(void)
         LIBCFS_ALLOC(kgnilnd_data.kgn_conns,
                     sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
  
-       if (kgnilnd_data.kgn_conns == NULL) {
-               rc = -ENOMEM;
-               GOTO(failed, rc);
-       }
+       if (kgnilnd_data.kgn_conns == NULL)
+               GOTO(failed, rc = -ENOMEM);
  
         for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
                 INIT_LIST_HEAD(&kgnilnd_data.kgn_conns[i]);
@@ -2131,68 +2261,49 @@ int kgnilnd_base_startup(void)
         LIBCFS_ALLOC(kgnilnd_data.kgn_nets,
                     sizeof(struct list_head) * *kgnilnd_tunables.kgn_net_hash_size);
  
-       if (kgnilnd_data.kgn_nets == NULL) {
-               rc = -ENOMEM;
-               GOTO(failed, rc);
-       }
+       if (kgnilnd_data.kgn_nets == NULL)
+               GOTO(failed, rc = -ENOMEM);
  
         for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
                 INIT_LIST_HEAD(&kgnilnd_data.kgn_nets[i]);
         }
  
         kgnilnd_data.kgn_mbox_cache =
-               cfs_mem_cache_create("kgn_mbox_block",
-                                    KMALLOC_MAX_SIZE,
-                                    0,    /* offset */
-                                    SLAB_HWCACHE_ALIGN);   /* flags */
+               kmem_cache_create("kgn_mbox_block", KMALLOC_MAX_SIZE, 0,
+                                 SLAB_HWCACHE_ALIGN, NULL);
         if (kgnilnd_data.kgn_mbox_cache == NULL) {
                 CERROR("Can't create slab for physical mbox blocks\n");
-               rc = -ENOMEM;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENOMEM);
         }
  
         kgnilnd_data.kgn_rx_cache =
-               cfs_mem_cache_create("kgn_rx_t",
-                                    sizeof(kgn_rx_t),
-                                    0,    /* offset */
-                                    0);   /* flags */
+               kmem_cache_create("kgn_rx_t", sizeof(kgn_rx_t), 0, 0, NULL);
         if (kgnilnd_data.kgn_rx_cache == NULL) {
                 CERROR("Can't create slab for kgn_rx_t descriptors\n");
-               rc = -ENOMEM;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENOMEM);
         }
  
         kgnilnd_data.kgn_tx_cache =
-               cfs_mem_cache_create("kgn_tx_t",
-                                    sizeof(kgn_tx_t),
-                                    0,    /* offset */
-                                    0);   /* flags */
+               kmem_cache_create("kgn_tx_t", sizeof(kgn_tx_t), 0, 0, NULL);
         if (kgnilnd_data.kgn_tx_cache == NULL) {
                 CERROR("Can't create slab for kgn_tx_t\n");
-               rc = -ENOMEM;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENOMEM);
         }
  
         kgnilnd_data.kgn_tx_phys_cache =
-               cfs_mem_cache_create("kgn_tx_phys",
-                                    LNET_MAX_IOV * sizeof(gni_mem_segment_t),
-                                    0,    /* offset */
-                                    0);   /* flags */
+               kmem_cache_create("kgn_tx_phys",
+                                  LNET_MAX_IOV * sizeof(gni_mem_segment_t),
+                                  0, 0, NULL);
         if (kgnilnd_data.kgn_tx_phys_cache == NULL) {
                 CERROR("Can't create slab for kgn_tx_phys\n");
-               rc = -ENOMEM;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENOMEM);
         }
  
         kgnilnd_data.kgn_dgram_cache =
-               cfs_mem_cache_create("kgn_dgram_t",
-                                    sizeof(kgn_dgram_t),
-                                    0,    /* offset */
-                                    0);   /* flags */
+               kmem_cache_create("kgn_dgram_t", sizeof(kgn_dgram_t), 0, 0, NULL);
         if (kgnilnd_data.kgn_dgram_cache == NULL) {
                 CERROR("Can't create slab for outgoing datagrams\n");
-               rc = -ENOMEM;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENOMEM);
         }
  
         /* allocate a MAX_IOV array of page pointers for each cpu */
@@ -2200,8 +2311,7 @@ int kgnilnd_base_startup(void)
                                                    GFP_KERNEL);
         if (kgnilnd_data.kgn_cksum_map_pages == NULL) {
                 CERROR("Can't allocate vmap cksum pages\n");
-               rc = -ENOMEM;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENOMEM);
         }
         kgnilnd_data.kgn_cksum_npages = num_possible_cpus();
         memset(kgnilnd_data.kgn_cksum_map_pages, 0,
@@ -2212,8 +2322,7 @@ int kgnilnd_base_startup(void)
                                                               GFP_KERNEL);
                 if (kgnilnd_data.kgn_cksum_map_pages[i] == NULL) {
                         CERROR("Can't allocate vmap cksum pages for cpu %d\n", i);
-                       rc = -ENOMEM;
-                       GOTO(failed, rc);
+                       GOTO(failed, rc = -ENOMEM);
                 }
         }
  
@@ -2229,16 +2338,14 @@ int kgnilnd_base_startup(void)
                         kgnilnd_data.kgn_ndevs++;
  
                         rc = kgnilnd_allocate_phys_fmablk(dev);
-                       if (rc) {
+                       if (rc)
                                 GOTO(failed, rc);
-                       }
                 }
         }
  
         if (kgnilnd_data.kgn_ndevs == 0) {
                 CERROR("Can't initialise any GNI devices\n");
-               rc = -ENODEV;
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENODEV);
         }
  
         rc = kgnilnd_thread_start(kgnilnd_reaper, NULL, "kgnilnd_rpr", 0);
@@ -2247,6 +2354,12 @@ int kgnilnd_base_startup(void)
                 GOTO(failed, rc);
         }
  
+       rc = kgnilnd_start_rca_thread();
+       if (rc != 0) {
+               CERROR("Can't spawn gnilnd rca: %d\n", rc);
+               GOTO(failed, rc);
+       }
+
         /*
          * Start ruhroh thread.  We can't use kgnilnd_thread_start() because
          * we don't want this thread included in kgnilnd_data.kgn_nthreads
@@ -2316,7 +2429,7 @@ failed:
  void
  kgnilnd_base_shutdown(void)
  {
-       int           i;
+       int                     i, j;
         ENTRY;
  
         while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_SHUTDOWN, 1)) {};
@@ -2326,10 +2439,29 @@ kgnilnd_base_shutdown(void)
         for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
                 kgnilnd_cancel_wc_dgrams(dev);
+               kgnilnd_cancel_dgrams(dev);
                 kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN);
                 kgnilnd_wait_for_canceled_dgrams(dev);
         }
  
+       /* We need to verify there are no conns left before we let the threads
+        * shut down; otherwise we could clean up the peers but still have
+        * some outstanding conns due to orphaned datagram conns that are
+        * being cleaned up.
+        */
+       i = 2;
+       while (atomic_read(&kgnilnd_data.kgn_nconns) != 0) {
+               i++;
+
+               for(j = 0; j < kgnilnd_data.kgn_ndevs; ++j) {
+                       kgn_device_t *dev = &kgnilnd_data.kgn_devices[j];
+                       kgnilnd_schedule_device(dev);
+               }
+
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                       "Waiting for conns to be cleaned up %d\n",atomic_read(&kgnilnd_data.kgn_nconns));
+               cfs_pause(cfs_time_seconds(1));
+       }
         /* Peer state all cleaned up BEFORE setting shutdown, so threads don't
          * have to worry about shutdown races.  NB connections may be created
          * while there are still active connds, but these will be temporary
@@ -2357,7 +2489,7 @@ kgnilnd_base_shutdown(void)
                 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
  
                 /* should clear all the MDDs */
-               kgnilnd_unmap_phys_fmablk(dev);
+               kgnilnd_unmap_fma_blocks(dev);
  
                 kgnilnd_schedule_device(dev);
                 wake_up_all(&dev->gnd_dgram_waitq);
@@ -2369,6 +2501,8 @@ kgnilnd_base_shutdown(void)
         wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
         spin_unlock(&kgnilnd_data.kgn_reaper_lock);
  
+       kgnilnd_wakeup_rca_thread();
+
         /* Wait for threads to exit */
         i = 2;
         while (atomic_read(&kgnilnd_data.kgn_nthreads) != 0) {
@@ -2433,30 +2567,20 @@ kgnilnd_base_shutdown(void)
                 kgnilnd_free_phys_fmablk(dev);
         }
  
-       if (kgnilnd_data.kgn_mbox_cache != NULL) {
-               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_mbox_cache);
-               LASSERTF(i == 0, "rc %d destroying kgn_mbox_cache\n", i);
-       }
+       if (kgnilnd_data.kgn_mbox_cache != NULL)
+               kmem_cache_destroy(kgnilnd_data.kgn_mbox_cache);
  
-       if (kgnilnd_data.kgn_rx_cache != NULL) {
-               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_rx_cache);
-               LASSERTF(i == 0, "rc %d destroying kgn_rx_cache\n", i);
-       }
+       if (kgnilnd_data.kgn_rx_cache != NULL)
+               kmem_cache_destroy(kgnilnd_data.kgn_rx_cache);
  
-       if (kgnilnd_data.kgn_tx_cache != NULL) {
-               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_cache);
-               LASSERTF(i == 0, "rc %d destroying kgn_tx_cache\n", i);
-       }
+       if (kgnilnd_data.kgn_tx_cache != NULL)
+               kmem_cache_destroy(kgnilnd_data.kgn_tx_cache);
  
-       if (kgnilnd_data.kgn_tx_phys_cache != NULL) {
-               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_phys_cache);
-               LASSERTF(i == 0, "rc %d destroying kgn_tx_phys_cache\n", i);
-       }
+       if (kgnilnd_data.kgn_tx_phys_cache != NULL)
+               kmem_cache_destroy(kgnilnd_data.kgn_tx_phys_cache);
  
-       if (kgnilnd_data.kgn_dgram_cache != NULL) {
-               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_dgram_cache);
-               LASSERTF(i == 0, "rc %d destroying kgn_dgram_cache\n", i);
-       }
+       if (kgnilnd_data.kgn_dgram_cache != NULL)
+               kmem_cache_destroy(kgnilnd_data.kgn_dgram_cache);
  
         if (kgnilnd_data.kgn_cksum_map_pages != NULL) {
                 for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) {
@@ -2471,7 +2595,7 @@ kgnilnd_base_shutdown(void)
                atomic_read(&libcfs_kmemory));
  
         kgnilnd_data.kgn_init = GNILND_INIT_NOTHING;
-       PORTAL_MODULE_UNUSE;
+       module_put(THIS_MODULE);
  
         EXIT;
  }
@@ -2494,14 +2618,13 @@ kgnilnd_startup(lnet_ni_t *ni)
         }
  
         /* Serialize with shutdown. */
-       down(&kgnilnd_data.kgn_quiesce_sem);
+       mutex_lock(&kgnilnd_data.kgn_quiesce_mutex);
  
         LIBCFS_ALLOC(net, sizeof(*net));
         if (net == NULL) {
                 CERROR("could not allocate net for new interface instance\n");
-               rc = -ENOMEM;
                 /* no need to cleanup the CDM... */
-               GOTO(failed, rc);
+               GOTO(failed, rc = -ENOMEM);
         }
         INIT_LIST_HEAD(&net->gnn_list);
         ni->ni_data = net;
@@ -2511,12 +2634,23 @@ kgnilnd_startup(lnet_ni_t *ni)
  
         if (*kgnilnd_tunables.kgn_peer_health) {
                 int     fudge;
-
+               int     timeout;
                 /* give this a bit of leeway - we don't have a hard timeout
                  * as we only check timeouts periodically - see comment in kgnilnd_reaper */
                 fudge = (GNILND_TO2KA(*kgnilnd_tunables.kgn_timeout) / GNILND_REAPER_NCHECKS);
-
-               ni->ni_peertimeout = *kgnilnd_tunables.kgn_timeout + fudge;
+               timeout = *kgnilnd_tunables.kgn_timeout + fudge;
+
+               if (*kgnilnd_tunables.kgn_peer_timeout >= timeout)
+                       ni->ni_peertimeout = *kgnilnd_tunables.kgn_peer_timeout;
+               else if (*kgnilnd_tunables.kgn_peer_timeout > -1) {
+                       LCONSOLE_ERROR("Peer_timeout is set to %d but needs to be >= %d\n",
+                                       *kgnilnd_tunables.kgn_peer_timeout,
+                                       timeout);
+                       ni->ni_data = NULL;
+                       LIBCFS_FREE(net, sizeof(*net));
+                       GOTO(failed, rc = -EINVAL);
+               } else
+                       ni->ni_peertimeout = timeout;
  
                 LCONSOLE_INFO("Enabling LNet peer health for gnilnd, timeout %ds\n",
                               ni->ni_peertimeout);
@@ -2551,10 +2685,10 @@ kgnilnd_startup(lnet_ni_t *ni)
  
         /* we need a separate thread to call probe_wait_by_id until
          * we get a function callback notifier from kgni */
-       up(&kgnilnd_data.kgn_quiesce_sem);
+       mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
         RETURN(0);
   failed:
-       up(&kgnilnd_data.kgn_quiesce_sem);
+       mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
         kgnilnd_shutdown(ni);
         RETURN(rc);
  }
@@ -2573,14 +2707,13 @@ kgnilnd_shutdown(lnet_ni_t *ni)
                 "init %d\n", kgnilnd_data.kgn_init);
  
         /* Serialize with startup. */
-       down(&kgnilnd_data.kgn_quiesce_sem);
+       mutex_lock(&kgnilnd_data.kgn_quiesce_mutex);
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
                atomic_read(&libcfs_kmemory));
  
         if (net == NULL) {
                 CERROR("got NULL net for ni %p\n", ni);
-               rc = -EINVAL;
-               GOTO(out, rc);
+               GOTO(out, rc = -EINVAL);
         }
  
         LASSERTF(ni == net->gnn_ni,
@@ -2657,9 +2790,8 @@ out:
         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
                atomic_read(&libcfs_kmemory));
  
-       up(&kgnilnd_data.kgn_quiesce_sem);
+       mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
         EXIT;
-       return;
  }
  
  void __exit