Whamcloud - gitweb
LU-12931 gnilnd: use time_after() to compare jiffies
diff --git a/lnet/klnds/gnilnd/gnilnd_stack.c b/lnet/klnds/gnilnd/gnilnd_stack.c
index 10ae493..bab50cb 100644
--- a/lnet/klnds/gnilnd/gnilnd_stack.c
+++ b/lnet/klnds/gnilnd/gnilnd_stack.c
@@ -1,6 +1,8 @@
 /*
  * Copyright (C) 2012 Cray, Inc.
  *
+ * Copyright (c) 2014, Intel Corporation.
+ *
  *   Author: Nic Henke <nic@cray.com>
  *
  *   This file is part of Lustre, http://www.lustre.org.
@@ -20,7 +22,9 @@
  *
  */
 #include "gnilnd.h"
-
+#if defined(GNILND_USE_RCA)
+#include <rsms/rs_sm_states.h>
+#endif
 /* Advance all timeouts by nap_time seconds. */
 void
 kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
@@ -32,7 +36,7 @@ kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
        kgn_device_t           *dev;
        kgn_dgram_t            *dgram;
 
-       LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);
+       CDEBUG(D_INFO, "%s: bumping all timeouts by %ds\n", reason, nap_time);
 
        LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
                 atomic_read(&kgnilnd_data.kgn_nquiesce),
@@ -54,6 +58,7 @@ kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
                        peer->gnp_reconnect_interval = 0;
                        /* tell LNet dude is still alive */
                        kgnilnd_peer_alive(peer);
+                       kgnilnd_peer_notify(peer, 0, 1);
 
                        list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
                                tx->tx_qtime = jiffies;
@@ -73,6 +78,7 @@ kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
                                 * we'll back it up and schedule the conn to trigger
                                 * a NOOP */
                                conn->gnc_last_tx = jiffies - timeout;
+                               if (conn->gnc_state != GNILND_CONN_DONE)
                                kgnilnd_schedule_conn(conn);
                        }
                }
@@ -112,32 +118,33 @@ kgnilnd_quiesce_wait(char *reason)
                        wake_up_all(&dev->gnd_dgping_waitq);
                }
 
+               kgnilnd_wakeup_rca_thread();
+
                /* we'll wait for 10x the timeout for the threads to pause */
                quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
                quiesce_deadline = (long) jiffies + quiesce_to;
 
+               LCONSOLE_INFO("Quiesce start: %s\n", reason);
                /* wait for everyone to check-in as quiesced */
-               i = 1;
                while (!GNILND_IS_QUIESCED) {
-                       i++;
-                       LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+                       CDEBUG(D_INFO,
                                 "%s: Waiting for %d threads to pause\n",
                                 reason,
                                 atomic_read(&kgnilnd_data.kgn_nthreads) -
                                 atomic_read(&kgnilnd_data.kgn_nquiesce));
                        CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
-                       cfs_pause(cfs_time_seconds(1 * i));
+                       schedule_timeout_uninterruptible(cfs_time_seconds(i));
 
-                       LASSERTF(quiesce_deadline > jiffies,
+                       LASSERTF(time_after(quiesce_deadline, jiffies),
                                 "couldn't quiesce threads in %lu seconds, falling over now\n",
                                 cfs_duration_sec(quiesce_to));
                }
 
-               LCONSOLE_WARN("%s: All threads paused!\n", reason);
+               CDEBUG(D_INFO, "%s: All threads paused!\n", reason);
                /* XXX Nic: Is there a set of counters we can grab here to
                 * ensure that there is no traffic until quiesce is over ?*/
        } else {
-               /* GO! GO! GO! */
+               LCONSOLE_INFO("Quiesce complete: %s\n", reason);
 
                for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                        kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
@@ -146,17 +153,15 @@ kgnilnd_quiesce_wait(char *reason)
 
                /* wait for everyone to check-in as running - they will be spinning
                 * and looking, so no need to poke any waitq */
-               i = 1;
                while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
-                       i++;
-                       LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+                       CDEBUG(D_INFO,
                                 "%s: Waiting for %d threads to wake up\n",
                                  reason,
                                  atomic_read(&kgnilnd_data.kgn_nquiesce));
-                       cfs_pause(cfs_time_seconds(1 * i));
+                       schedule_timeout_uninterruptible(cfs_time_seconds(i));
                }
 
-               LCONSOLE_WARN("%s: All threads awake!\n", reason);
+               CDEBUG(D_INFO, "%s: All threads awake!\n", reason);
        }
 }
 
@@ -247,6 +252,9 @@ kgnilnd_reset_stack(void)
 
                        list_del_init(&conn->gnc_schedlist);
 
+                       if (!list_empty(&conn->gnc_delaylist))
+                               list_del_init(&conn->gnc_delaylist);
+
                        if (conn->gnc_state == GNILND_CONN_CLOSING) {
                                /* bump to CLOSED to fake out send of CLOSE */
                                conn->gnc_state = GNILND_CONN_CLOSED;
@@ -294,7 +302,7 @@ kgnilnd_reset_stack(void)
                /* now all the cons/mboxes should be cleaned up, including purgatory
                 * so go through and release the MDDs for our persistent PHYS fma_blks
                 */
-               kgnilnd_unmap_phys_fmablk(dev);
+               kgnilnd_unmap_fma_blocks(dev);
 
                LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
                        "reset failed: fma blocks still live %d\n",
@@ -362,8 +370,6 @@ kgnilnd_ruhroh_thread(void *arg)
        int                i = 1;
        DEFINE_WAIT(wait);
 
-       cfs_daemonize("kgnilnd_rr");
-       cfs_block_allsigs();
        set_user_nice(current, *kgnilnd_tunables.kgn_nice);
        kgnilnd_data.kgn_ruhroh_running = 1;
 
@@ -383,7 +389,7 @@ kgnilnd_ruhroh_thread(void *arg)
                        break;
 
                /* Serialize with driver startup and shutdown. */
-               down(&kgnilnd_data.kgn_quiesce_sem);
+               mutex_lock(&kgnilnd_data.kgn_quiesce_mutex);
 
               CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
                        kgnilnd_data.kgn_quiesce_trigger,
@@ -396,7 +402,7 @@ kgnilnd_ruhroh_thread(void *arg)
 
                        /* Pause all other kgnilnd threads. */
                        set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
-                       kgnilnd_quiesce_wait("hardware quiesce flag");
+                       kgnilnd_quiesce_wait("hardware quiesce");
 
                        /* If the hardware quiesce flag is set, wait for it to clear.
                         * This should happen relatively quickly, so we wait for it.
@@ -411,9 +417,10 @@ kgnilnd_ruhroh_thread(void *arg)
                        while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
 
                                i++;
-                               LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
-                                               "Waiting for hardware quiesce flag to clear\n");
-                               cfs_pause(cfs_time_seconds(1 * i));
+                               CDEBUG(D_INFO, "Waiting for hardware quiesce "
+                                              "flag to clear\n");
+                               schedule_timeout_uninterruptible(
+                                       cfs_time_seconds(i));
 
                                /* If we got a quiesce event with bump info, DO THE BUMP!. */
                                if (kgnilnd_data.kgn_bump_info_rdy) {
@@ -448,7 +455,7 @@ kgnilnd_ruhroh_thread(void *arg)
                        set_mb(kgnilnd_data.kgn_needs_reset, 0);
                }
 
-               up(&kgnilnd_data.kgn_quiesce_sem);
+               mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
        }
 
        kgnilnd_data.kgn_ruhroh_running = 0;
@@ -528,7 +535,7 @@ kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs)
 
        if (!kgnilnd_data.kgn_ruhroh_shutdown) {
 
-               CDEBUG(D_NET, "requesting timeout bump by "LPD64" msecs\n", msecs);
+               CDEBUG(D_NET, "requesting timeout bump by %lld msecs\n", msecs);
 
                /* Save the bump interval and request the bump.
                 * The memory barrier ensures that the interval is in place before
@@ -562,3 +569,243 @@ kgnilnd_critical_error(struct gni_err *err_handle)
                CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");
        }
 }
+
+#if defined(GNILND_USE_RCA)
+#include <krca_lib.h>
+#define RCA_EVENTS 3
+/* RCA ticket is needed for krca_wakeup_wait_event() */
+static krca_ticket_t rca_krt = KRCA_NULL_TICKET;
+struct rcadata {
+       rca_ticket_t ticket;
+       int subscribed;
+       rs_event_code_t ec;
+};
+static struct rcadata rd[RCA_EVENTS] = {
+       { .ec = ec_node_unavailable },
+       { .ec = ec_node_available },
+       { .ec = ec_node_failed } };
+
+/* thread for receiving rca events */
+int
+kgnilnd_rca(void *arg)
+{
+       int        i, rc;
+       int        retry_count;
+       rs_event_t event;
+       lnet_nid_t nid;
+
+       /* all gnilnd threads need to run fairly urgently */
+       set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+
+       /*
+        * Register our service with RCA and subscribe to events
+        * of interest.
+        */
+       rca_krt = KRCA_NULL_TICKET;
+       rc = krca_register(&rca_krt, RCA_SVCTYPE_GNILND, current->pid, 0);
+       if (rc < 0) {
+               CNETERR("krca_register(%x) returned %d\n", current->pid, rc);
+               goto done;
+       }
+
+       for (i = 0; i < RCA_EVENTS; i++) {
+               retry_count = 0;
+subscribe_retry:
+               rc = krca_subscribe(&rca_krt, rd[i].ec, RCA_RX_SVC_ANY,
+                                   &rd[i].ticket);
+
+               if ((rc == -EINTR) && !retry_count) {
+                       retry_count++;
+                       CNETERR("krca_subscribe returned %d - retrying\n", rc);
+                       goto subscribe_retry;
+               }
+
+               if (rc < 0) {
+                       CNETERR("rca subscription failed (%d)\n", rc);
+                       goto done;
+               }
+
+               rd[i].subscribed = 1;
+       }
+
+       while (!kgnilnd_data.kgn_shutdown) {
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       KGNILND_SPIN_QUIESCE;
+               }
+               /* wait here for a subscribed event */
+               rc = krca_wait_event(&rca_krt);
+
+               /* RCA return values:
+                * 0 indicates krca_wakeup_wait_event caused krca_wait_event
+                *   return.
+                * -ERESTARTSYS indicates krca_wait_event returned because of a
+                *   signal.
+                * -ENOSPC indicates no space available to create an rcad_reg_t
+                * 1 indicates a message is waiting.
+                */
+               if (rc <= 0) {
+                       continue;
+               }
+
+               if (krca_get_message(&rca_krt, &event) == 0) {
+                       int node_down = GNILND_PEER_UNKNOWN;
+                       rs_state_t state;
+                       LIST_HEAD(zombies);
+
+                       /* Compute nodes don't care about other compute nodes
+                        * so we don't need to create a peer.
+                        */
+                       if (GNILND_COMPUTE &&
+                           !RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+                                       IS_SVC)) {
+                               continue;
+                       }
+
+                       /* Only care about compute and service nodes not GPUs */
+                       if (!(RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+                                       TYPE) == rt_node ||
+                            RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+                                       TYPE) == rt_accel)) {
+                                               continue;
+                       }
+
+                       switch (event.ev_id) {
+                       case ec_node_available:
+                               CDEBUG(D_INFO, "ec_node_available\n");
+                               node_down = GNILND_PEER_UP;
+                               break;
+                       case ec_node_failed:
+                               CDEBUG(D_INFO, "ec_node_failed\n");
+                               if (event.ev_len > 0) {
+                                       CDEBUG(D_ERROR,
+                                               "ec_node_failed ignored\n");
+                                       break;
+                               }
+                               node_down = GNILND_PEER_DOWN;
+                               break;
+                       case ec_node_unavailable:
+                               state = RSN_GET_FLD(event.ev_gen.svid_node.rsn_intval, STATE);
+
+                               CDEBUG(D_INFO, "ec_node_unavailable\n");
+
+                               /*
+                                * Ignore overloaded ec_node_unavailable events
+                                * generated by 'xtcli set_reserve'.
+                                */
+                               if (RS_GET_CS_STATE(state) == RS_CS_READY) {
+                                       CDEBUG(D_INFO, "ignoring "
+                                               "ec_node_unavailable event with"
+                                               " RS_CS_READY state\n");
+                                       break;
+                               }
+                               node_down = GNILND_PEER_DOWN;
+                               break;
+                       default:
+                               CDEBUG(D_INFO, "unknown event\n");
+                               break;
+                       }
+
+                       /* if we get an event we don't know about, just go ahead
+                        * and wait for another event */
+                       if (node_down == GNILND_PEER_UNKNOWN)
+                               continue;
+
+                       nid = RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+                                         NID);
+                       CDEBUG(D_INFO,"kgnilnd_rca() reporting nid %d %s\n",
+                              (int)nid, node_down ? "down" : "up");
+                       kgnilnd_report_node_state(nid, node_down);
+
+               } else {
+                       CNETERR("krca_get_message failed\n");
+               }
+       }
+
+done:
+       CDEBUG(D_INFO, "done\n");
+
+       for (i = 0; i < RCA_EVENTS; i++) {
+               if (rd[i].subscribed) {
+                       rc = krca_unsubscribe(&rca_krt, rd[i].ticket);
+
+                       if (rc) {
+                               CNETERR("rca unsubscribe failed (%d)\n", rc);
+                       }
+
+                       rd[i].subscribed = 0;
+               }
+       }
+
+       krca_unregister(&rca_krt);
+       kgnilnd_thread_fini();
+       return 0;
+
+}
+
+int
+kgnilnd_start_rca_thread(void)
+{
+       return kgnilnd_thread_start(kgnilnd_rca, NULL, "kgnilnd_rca", 0);
+}
+
+void
+kgnilnd_wakeup_rca_thread(void)
+{
+       int ret;
+
+       ret = krca_wakeup_wait_event(&rca_krt);
+
+       if (ret) {
+               CDEBUG(D_ERROR, "krca_wakeup_wait_event failed\n");
+       }
+}
+
+int
+kgnilnd_get_node_state(__u32 nid)
+{
+       int i;
+       int rc = GNILND_PEER_UNKNOWN;
+       int ret;
+       rs_node_array_t nlist;
+       rs_node_t       *na = NULL;
+
+       if ((ret = krca_get_sysnodes(&nlist)) < 0) {
+               CDEBUG(D_NETERROR, "krca_get_sysnodes failed %d\n", ret);
+               goto ns_done;
+       }
+
+       na = nlist.na_ids;
+
+       for (i = 0; i < nlist.na_len; i++) {
+               if ((rca_nid_t)RSN_GET_FLD(na[i].rs_node_flat, NID) == nid) {
+                       rc = RSN_GET_FLD(na[i].rs_node_flat, STATE) == RS_CS_READY ?
+                               GNILND_PEER_UP : GNILND_PEER_DOWN;
+                       break;
+               }
+       }
+
+ns_done:
+       kfree(na);
+       CDEBUG(D_NET, "nid %d rc %d (0=up)\n", nid, rc);
+       return rc;
+}
+
+#else /* GNILND_USE_RCA */
+
+int
+kgnilnd_start_rca_thread(void)
+{
+       return 0;
+}
+
+void
+kgnilnd_wakeup_rca_thread(void)
+{
+}
+
+int
+kgnilnd_get_node_state(__u32 nid)
+{
+       return GNILND_PEER_UP;
+}
+#endif /* GNILND_USE_RCA */