/*
* Copyright (C) 2012 Cray, Inc.
*
+ * Copyright (c) 2013, 2014, Intel Corporation.
+ *
* Author: Nic Henke <nic@cray.com>
* Author: James Shimek <jshimek@cray.com>
*
kgn_data_t kgnilnd_data;
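+
+/* spawn a kthread via kthread_run(); on success the thread is counted
+ * in kgn_nthreads so base shutdown can wait for it to exit */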
+int
+kgnilnd_thread_start(int (*fn)(void *arg), void *arg, char *name, int id)
+{
+ struct task_struct *thrd;
+
+ thrd = kthread_run(fn, arg, "%s_%02d", name, id);
+ if (IS_ERR(thrd))
+ return PTR_ERR(thrd);
+
+ atomic_inc(&kgnilnd_data.kgn_nthreads);
+ return 0;
+}
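+/* illustrative use (hypothetical values), mirroring the scheduler
+ * startup further below:
+ *	rc = kgnilnd_thread_start(kgnilnd_scheduler, (void *)0L, "kgnilnd_sd", 0);
+ */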
+
+/* bind scheduler threads to cpus */
+int
+kgnilnd_start_sd_threads(void)
+{
+ int cpu;
+ int i = 0;
+ struct task_struct *task;
+
+ for_each_online_cpu(cpu) {
+ /* don't bind to cpu 0 - all interrupts are processed here */
+ if (cpu == 0)
+ continue;
+
+ task = kthread_create(kgnilnd_scheduler, (void *)((long)i),
+ "%s_%02d", "kgnilnd_sd", i);
+ if (!IS_ERR(task)) {
+ kthread_bind(task, cpu);
+ wake_up_process(task);
+ } else {
+ CERROR("Can't spawn gnilnd scheduler[%d]: %ld\n", i,
+ PTR_ERR(task));
+ return PTR_ERR(task);
+ }
+ atomic_inc(&kgnilnd_data.kgn_nthreads);
+
+ if (++i >= *kgnilnd_tunables.kgn_sched_threads)
+ break;
+ }
+
+ return 0;
+}
+
/* needs write_lock on kgn_peer_conn_lock */
int
kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn)
GOTO(failed, rc = -ENOMEM);
}
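+ /* these presumably serialize GNI SMSG sends and RDMA posts on this conn */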
+ mutex_init(&conn->gnc_smsg_mutex);
+ mutex_init(&conn->gnc_rdma_mutex);
atomic_set(&conn->gnc_refcount, 1);
atomic_set(&conn->gnc_reaper_noop, 0);
atomic_set(&conn->gnc_sched_noop, 0);
}
void
-kgnilnd_peer_notify(kgn_peer_t *peer, int error)
+kgnilnd_peer_notify(kgn_peer_t *peer, int error, int alive)
{
int tell_lnet = 0;
int nnets = 0;
peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting, conn,
kgnilnd_data.kgn_in_reset, error);
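+ /* an 'alive' event is always passed on to LNet; errors are reported
+ * only when the peer is idle with no conn, we are not in reset, and
+ * the errno is not one of the clean ones */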
- if ((peer->gnp_connecting == GNILND_PEER_IDLE) &&
+ if (((peer->gnp_connecting == GNILND_PEER_IDLE) &&
(conn == NULL) &&
(!kgnilnd_data.kgn_in_reset) &&
- (!kgnilnd_conn_clean_errno(error))) {
+ (!kgnilnd_conn_clean_errno(error))) || alive) {
tell_lnet = 1;
}
peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive,
cfs_duration_sec(jiffies - peer->gnp_last_alive));
- lnet_notify(net->gnn_ni, peer_nid, 0, peer->gnp_last_alive);
-
+ lnet_notify(net->gnn_ni, peer_nid, alive,
+ peer->gnp_last_alive);
kgnilnd_net_decref(net);
}
/* I'm telling Mommy! - use peer_error if they initiated close */
kgnilnd_peer_notify(conn->gnc_peer,
- conn->gnc_error == -ECONNRESET ? conn->gnc_peer_error
- : conn->gnc_error);
+ conn->gnc_error == -ECONNRESET ?
+ conn->gnc_peer_error : conn->gnc_error, 0);
EXIT;
}
* make sure we tell LNet - if this is from other context,
* the checks in the function will prevent an errant
* notification */
- kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error);
+ kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error, 0);
list_for_each_entry_safe(gmp, gmpN, &conn->gnc_mdd_list,
gmp_list) {
atomic_read(&kgnilnd_data.kgn_npending_detach) ||
atomic_read(&kgnilnd_data.kgn_npending_unlink)) {
- cfs_pause(cfs_time_seconds(1));
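+ /* cfs_pause() was removed from libcfs; open-code an uninterruptible
+ * one-second sleep instead */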
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
i++;
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, "Waiting on %d peers %d closes %d detaches\n",
*device_id = conn->gnc_device->gnd_host_id;
*peerstamp = conn->gnc_peerstamp;
- *tx_seq = conn->gnc_tx_seq;
- *rx_seq = conn->gnc_rx_seq;
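+ /* gnc_tx_seq/gnc_rx_seq are atomic_t now, so sample them here with
+ * atomic_read() */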
+ *tx_seq = atomic_read(&conn->gnc_tx_seq);
+ *rx_seq = atomic_read(&conn->gnc_rx_seq);
*fmaq_len = kgnilnd_count_list(&conn->gnc_fmaq);
*nfma = atomic_read(&conn->gnc_nlive_fma);
*nrdma = atomic_read(&conn->gnc_nlive_rdma);
* kgnilnd_tx_done
*/
kgnilnd_txlist_done(&zombies, -ENETRESET);
-
- if (*kgnilnd_tunables.kgn_peer_health) {
- kgnilnd_peer_notify(peer, -ECONNRESET);
- }
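+ /* the kgn_peer_health check is assumed to be handled inside
+ * kgnilnd_peer_notify() now, so call it unconditionally */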
+ kgnilnd_peer_notify(peer, -ECONNRESET, 0);
+ LCONSOLE_INFO("Received down event for nid %lld\n", nid);
}
- CDEBUG(D_INFO, "marking nid %lld %s\n", nid, down ? "down" : "up");
return 0;
}
kgn_tx_t *tx;
kgn_peer_t *peer = NULL;
kgn_conn_t *conn = NULL;
- lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+ lnet_process_id_t id = {
+ .nid = nid,
+ .pid = LNET_PID_LUSTRE,
+ };
ENTRY;
/* I expect to find him, so only take a read lock */
cq_size = *kgnilnd_tunables.kgn_credits * 2 * 3;
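+ /* use the configured pkey tunable rather than the hard-coded
+ * GNILND_COOKIE when creating the CDM */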
rrc = kgnilnd_cdm_create(dev->gnd_id, *kgnilnd_tunables.kgn_ptag,
- GNILND_COOKIE, 0,
+ *kgnilnd_tunables.kgn_pkey, 0,
&dev->gnd_domain);
if (rrc != GNI_RC_SUCCESS) {
CERROR("Can't create CDM %d (%d)\n", dev->gnd_id, rrc);
}
CDEBUG(D_NET, "NIC %x -> NID %d\n", dev->gnd_host_id, dev->gnd_nid);
- rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
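+ /* size the send-RDMA CQ by the credits tunable alone, assuming at
+ * most one outstanding RDMA per credit */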
+ rrc = kgnilnd_cq_create(dev->gnd_handle, *kgnilnd_tunables.kgn_credits,
0, kgnilnd_device_callback,
dev->gnd_id, &dev->gnd_snd_rdma_cqh);
if (rrc != GNI_RC_SUCCESS) {
dev->gnd_domain = NULL;
}
- sock_release(kgnilnd_data.kgn_sock);
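+ /* the socket may never have been created if startup failed early */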
+ if (kgnilnd_data.kgn_sock)
+ sock_release(kgnilnd_data.kgn_sock);
EXIT;
}
-
int kgnilnd_base_startup(void)
{
struct timeval tv;
int i;
kgn_device_t *dev;
struct task_struct *thrd;
+
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+ /* declaration only here; free_pages_limit is set after kgnilnd_data
+ * is zeroed below, so the memset cannot wipe it */
+ struct sysinfo si;
+#endif
+
ENTRY;
LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_NOTHING,
/* zero pointers, flags etc */
memset(&kgnilnd_data, 0, sizeof(kgnilnd_data));
+
+#if defined(CONFIG_CRAY_XT) && !defined(CONFIG_CRAY_COMPUTE)
+ /* limit how much memory can be allocated for fma blocks in
+ * instances where many nodes need to reconnect at the same time */
+ si_meminfo(&si);
+ kgnilnd_data.free_pages_limit = si.totalram / 4;
+#endif
+
+ kgnilnd_check_kgni_version();
/* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and
* a unique (for all time) connstamp so we can uniquely identify
}
/* threads will load balance across devs as they are available */
- for (i = 0; i < *kgnilnd_tunables.kgn_sched_threads; i++) {
- rc = kgnilnd_thread_start(kgnilnd_scheduler, (void *)((long)i),
- "kgnilnd_sd", i);
- if (rc != 0) {
- CERROR("Can't spawn gnilnd scheduler[%d]: %d\n",
- i, rc);
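+ /* with kgn_thread_affinity set, pin one scheduler thread per online
+ * CPU (skipping CPU 0) instead of letting them float */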
+ if (*kgnilnd_tunables.kgn_thread_affinity) {
+ rc = kgnilnd_start_sd_threads();
+ if (rc != 0)
GOTO(failed, rc);
+ } else {
+ for (i = 0; i < *kgnilnd_tunables.kgn_sched_threads; i++) {
+ rc = kgnilnd_thread_start(kgnilnd_scheduler,
+ (void *)((long)i),
+ "kgnilnd_sd", i);
+ if (rc != 0) {
+ CERROR("Can't spawn gnilnd scheduler[%d]: %d\n",
+ i, rc);
+ GOTO(failed, rc);
+ }
}
}
}
}
-
-
/* flag everything initialised */
kgnilnd_data.kgn_init = GNILND_INIT_ALL;
/*****************************************************/
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
"Waiting for conns to be cleaned up %d\n",atomic_read(&kgnilnd_data.kgn_nconns));
- cfs_pause(cfs_time_seconds(1));
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
}
/* Peer state all cleaned up BEFORE setting shutdown, so threads don't
* have to worry about shutdown races. NB connections may be created
i++;
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
"Waiting for ruhroh thread to terminate\n");
- cfs_pause(cfs_time_seconds(1));
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
}
/* Flag threads to terminate */
wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
spin_unlock(&kgnilnd_data.kgn_reaper_lock);
- kgnilnd_wakeup_rca_thread();
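+ /* only poke the RCA thread if any LND threads are still running */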
+ if (atomic_read(&kgnilnd_data.kgn_nthreads))
+ kgnilnd_wakeup_rca_thread();
/* Wait for threads to exit */
i = 2;
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
"Waiting for %d threads to terminate\n",
atomic_read(&kgnilnd_data.kgn_nthreads));
- cfs_pause(cfs_time_seconds(1));
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
}
LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0,
"Waiting for %d references to clear on net %d\n",
atomic_read(&net->gnn_refcount),
net->gnn_netnum);
- cfs_pause(cfs_time_seconds(1));
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
}
/* release ref from kgnilnd_startup */