/*
* Copyright (C) 2012 Cray, Inc.
*
+ * Copyright (c) 2014, Intel Corporation.
+ *
* Author: Nic Henke <nic@cray.com>
*
* This file is part of Lustre, http://www.lustre.org.
*
*/
#include "gnilnd.h"
-
+#if defined(GNILND_USE_RCA)
+#include <rsms/rs_sm_states.h>
+#endif
/* Advance all timeouts by nap_time seconds. */
void
kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
kgn_device_t *dev;
kgn_dgram_t *dgram;
- LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);
+ CDEBUG(D_INFO, "%s: bumping all timeouts by %ds\n", reason, nap_time);
LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
atomic_read(&kgnilnd_data.kgn_nquiesce),
peer->gnp_reconnect_interval = 0;
/* tell LNet dude is still alive */
kgnilnd_peer_alive(peer);
+ kgnilnd_peer_notify(peer, 0, 1);
list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
tx->tx_qtime = jiffies;
* we'll back it up and schedule the conn to trigger
* a NOOP */
conn->gnc_last_tx = jiffies - timeout;
+ if (conn->gnc_state != GNILND_CONN_DONE)
kgnilnd_schedule_conn(conn);
}
}
wake_up_all(&dev->gnd_dgping_waitq);
}
+ kgnilnd_wakeup_rca_thread();
+
/* we'll wait for 10x the timeout for the threads to pause */
quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
quiesce_deadline = (long) jiffies + quiesce_to;
+ LCONSOLE_INFO("Quiesce start: %s\n", reason);
/* wait for everyone to check-in as quiesced */
- i = 1;
while (!GNILND_IS_QUIESCED) {
- i++;
- LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+ CDEBUG(D_INFO,
"%s: Waiting for %d threads to pause\n",
reason,
atomic_read(&kgnilnd_data.kgn_nthreads) -
atomic_read(&kgnilnd_data.kgn_nquiesce));
CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
- cfs_pause(cfs_time_seconds(1 * i));
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1 * i));
LASSERTF(quiesce_deadline > jiffies,
"couldn't quiesce threads in %lu seconds, falling over now\n",
cfs_duration_sec(quiesce_to));
}
- LCONSOLE_WARN("%s: All threads paused!\n", reason);
+ CDEBUG(D_INFO, "%s: All threads paused!\n", reason);
/* XXX Nic: Is there a set of counters we can grab here to
* ensure that there is no traffic until quiesce is over ?*/
} else {
- /* GO! GO! GO! */
+ LCONSOLE_INFO("Quiesce complete: %s\n", reason);
for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
/* wait for everyone to check-in as running - they will be spinning
* and looking, so no need to poke any waitq */
- i = 1;
while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
- i++;
- LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+ CDEBUG(D_INFO,
"%s: Waiting for %d threads to wake up\n",
reason,
atomic_read(&kgnilnd_data.kgn_nquiesce));
- cfs_pause(cfs_time_seconds(1 * i));
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1 * i));
}
- LCONSOLE_WARN("%s: All threads awake!\n", reason);
+ CDEBUG(D_INFO, "%s: All threads awake!\n", reason);
}
}
list_del_init(&conn->gnc_schedlist);
+ if (!list_empty(&conn->gnc_delaylist))
+ list_del_init(&conn->gnc_delaylist);
+
if (conn->gnc_state == GNILND_CONN_CLOSING) {
/* bump to CLOSED to fake out send of CLOSE */
conn->gnc_state = GNILND_CONN_CLOSED;
/* now all the cons/mboxes should be cleaned up, including purgatory
* so go through and release the MDDs for our persistent PHYS fma_blks
*/
- kgnilnd_unmap_phys_fmablk(dev);
+ kgnilnd_unmap_fma_blocks(dev);
LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
"reset failed: fma blocks still live %d\n",
int i = 1;
DEFINE_WAIT(wait);
- cfs_daemonize("kgnilnd_rr");
cfs_block_allsigs();
set_user_nice(current, *kgnilnd_tunables.kgn_nice);
kgnilnd_data.kgn_ruhroh_running = 1;
break;
/* Serialize with driver startup and shutdown. */
- down(&kgnilnd_data.kgn_quiesce_sem);
+ mutex_lock(&kgnilnd_data.kgn_quiesce_mutex);
CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
kgnilnd_data.kgn_quiesce_trigger,
/* Pause all other kgnilnd threads. */
set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
- kgnilnd_quiesce_wait("hardware quiesce flag");
+ kgnilnd_quiesce_wait("hardware quiesce");
/* If the hardware quiesce flag is set, wait for it to clear.
* This should happen relatively quickly, so we wait for it.
while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
i++;
- LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
- "Waiting for hardware quiesce flag to clear\n");
- cfs_pause(cfs_time_seconds(1 * i));
+ CDEBUG(D_INFO, "Waiting for hardware quiesce "
+ "flag to clear\n");
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1 * i));
/* If we got a quiesce event with bump info, DO THE BUMP!. */
if (kgnilnd_data.kgn_bump_info_rdy) {
set_mb(kgnilnd_data.kgn_needs_reset, 0);
}
- up(&kgnilnd_data.kgn_quiesce_sem);
+ mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
}
kgnilnd_data.kgn_ruhroh_running = 0;
if (!kgnilnd_data.kgn_ruhroh_shutdown) {
- CDEBUG(D_NET, "requesting timeout bump by "LPD64" msecs\n", msecs);
+ CDEBUG(D_NET, "requesting timeout bump by %lld msecs\n", msecs);
/* Save the bump interval and request the bump.
* The memory barrier ensures that the interval is in place before
CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");
}
}
+
+#if defined(GNILND_USE_RCA)
+#include <krca_lib.h>
+#define RCA_EVENTS 3
+/* RCA ticket is needed for krca_wakeup_wait_event() */
+static krca_ticket_t rca_krt = KRCA_NULL_TICKET;
+/* Per-event subscription bookkeeping: one slot per RCA event code we
+ * listen for.  'subscribed' records that krca_subscribe() succeeded so
+ * shutdown only unsubscribes live tickets. */
+struct rcadata {
+	rca_ticket_t ticket;	/* ticket returned by krca_subscribe() */
+	int subscribed;		/* non-zero once subscription succeeded */
+	rs_event_code_t ec;	/* RCA event code this slot subscribes to */
+};
+static struct rcadata rd[RCA_EVENTS] = {
+	{ .ec = ec_node_unavailable },
+	{ .ec = ec_node_available },
+	{ .ec = ec_node_failed } };
+
+/* Thread for receiving RCA (Cray Resiliency Communication Agent) events.
+ *
+ * Registers this service with RCA, subscribes to the node-availability
+ * events listed in rd[], then loops until kgn_shutdown: each received
+ * event is filtered and translated into a peer up/down notification via
+ * kgnilnd_report_node_state().  On exit it unsubscribes and unregisters.
+ * Always returns 0 (thread exit status is unused). */
+int
+kgnilnd_rca(void *arg)
+{
+	int i, rc;
+	int retry_count;
+	rs_event_t event;
+	lnet_nid_t nid;
+
+	cfs_block_allsigs();
+
+	/* all gnilnd threads need to run fairly urgently */
+	set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+
+	/*
+	 * Register our service with RCA and subscribe to events
+	 * of interest.
+	 */
+	rca_krt = KRCA_NULL_TICKET;
+	rc = krca_register(&rca_krt, RCA_SVCTYPE_GNILND, current->pid, 0);
+	if (rc < 0) {
+		CNETERR("krca_register(%x) returned %d\n", current->pid, rc);
+		goto done;
+	}
+
+	/* Subscribe to each event code in rd[]; retry exactly once if the
+	 * subscribe call is interrupted by a signal (-EINTR). */
+	for (i = 0; i < RCA_EVENTS; i++) {
+		retry_count = 0;
+subscribe_retry:
+		rc = krca_subscribe(&rca_krt, rd[i].ec, RCA_RX_SVC_ANY,
+				    &rd[i].ticket);
+
+		if ((rc == -EINTR) && !retry_count) {
+			retry_count++;
+			CNETERR("krca_subscribe returned %d - retrying\n", rc);
+			goto subscribe_retry;
+		}
+
+		if (rc < 0) {
+			CNETERR("rca subscription failed (%d)\n", rc);
+			goto done;
+		}
+
+		rd[i].subscribed = 1;
+	}
+
+	while (!kgnilnd_data.kgn_shutdown) {
+		/* park here if a quiesce is in progress before blocking
+		 * again inside RCA */
+		if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+			KGNILND_SPIN_QUIESCE;
+		}
+		/* wait here for a subscribed event */
+		rc = krca_wait_event(&rca_krt);
+
+		/* RCA return values:
+		 * 0 indicates krca_wakeup_wait_event caused krca_wait_event
+		 * return.
+		 * -ERESTARTSYS indicates krca_wait_event returned because of a
+		 * signal.
+		 * -ENOSPC indicates no space available to create an rcad_reg_t
+		 * 1 indicates a message is waiting.
+		 */
+		if (rc <= 0) {
+			continue;
+		}
+
+		if (krca_get_message(&rca_krt, &event) == 0) {
+			/* stays GNILND_PEER_UNKNOWN unless the event maps to
+			 * an actionable up/down transition */
+			int node_down = GNILND_PEER_UNKNOWN;
+			rs_state_t state;
+			/* NOTE(review): zombies is never referenced in this
+			 * function - appears to be dead code */
+			LIST_HEAD(zombies);
+
+			/* Compute nodes don't care about other compute nodes
+			 * so we don't need to create a peer.
+			 */
+			if (GNILND_COMPUTE &&
+			    !RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+					IS_SVC)) {
+				continue;
+			}
+
+			/* Only care about compute and service nodes not GPUs */
+			if (!(RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+					TYPE) == rt_node ||
+			     RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+					TYPE) == rt_accel)) {
+				continue;
+			}
+
+			switch (event.ev_id) {
+			case ec_node_available:
+				CDEBUG(D_INFO, "ec_node_available\n");
+				node_down = GNILND_PEER_UP;
+				break;
+			case ec_node_failed:
+				CDEBUG(D_INFO, "ec_node_failed\n");
+				/* events with a payload are ignored -
+				 * presumably these are not whole-node
+				 * failures; TODO confirm against RCA docs */
+				if (event.ev_len > 0) {
+					CDEBUG(D_ERROR,
+						"ec_node_failed ignored\n");
+					break;
+				}
+				node_down = GNILND_PEER_DOWN;
+				break;
+			case ec_node_unavailable:
+				state = RSN_GET_FLD(event.ev_gen.svid_node.rsn_intval, STATE);
+
+				CDEBUG(D_INFO, "ec_node_unavailable\n");
+
+				/*
+				 * Ignore overloaded ec_node_unavailable events
+				 * generated by 'xtcli set_reserve'.
+				 */
+				if (RS_GET_CS_STATE(state) == RS_CS_READY) {
+					CDEBUG(D_INFO, "ignoring "
+						"ec_node_unavailable event with"
+						" RS_CS_READY state\n");
+					break;
+				}
+				node_down = GNILND_PEER_DOWN;
+				break;
+			default:
+				CDEBUG(D_INFO, "unknown event\n");
+				break;
+			}
+
+			/* if we get an event we don't know about, just go ahead
+			 * and wait for another event */
+			if (node_down == GNILND_PEER_UNKNOWN)
+				continue;
+
+			nid = RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
+					  NID);
+			CDEBUG(D_INFO,"kgnilnd_rca() reporting nid %d %s\n",
+			       (int)nid, node_down ? "down" : "up");
+			kgnilnd_report_node_state(nid, node_down);
+
+		} else {
+			CNETERR("krca_get_message failed\n");
+		}
+	}
+
+done:
+	CDEBUG(D_INFO, "done\n");
+
+	/* tear down only the subscriptions that actually succeeded */
+	for (i = 0; i < RCA_EVENTS; i++) {
+		if (rd[i].subscribed) {
+			rc = krca_unsubscribe(&rca_krt, rd[i].ticket);
+
+			if (rc) {
+				CNETERR("rca unsubscribe failed (%d)\n", rc);
+			}
+
+			rd[i].subscribed = 0;
+		}
+	}
+
+	krca_unregister(&rca_krt);
+	kgnilnd_thread_fini();
+	return 0;
+
+}
+
+/* Spawn the RCA event-listener thread ("kgnilnd_rca").  Returns the
+ * result of kgnilnd_thread_start(). */
+int
+kgnilnd_start_rca_thread(void)
+{
+	int rc;
+
+	rc = kgnilnd_thread_start(kgnilnd_rca, NULL, "kgnilnd_rca", 0);
+	return rc;
+}
+
+/* Kick the RCA listener out of krca_wait_event() (e.g. so it can notice
+ * shutdown or a pending quiesce).  A failed wakeup is only logged. */
+void
+kgnilnd_wakeup_rca_thread(void)
+{
+	if (krca_wakeup_wait_event(&rca_krt)) {
+		CDEBUG(D_ERROR, "krca_wakeup_wait_event failed\n");
+	}
+}
+
+/* Query RCA for the availability of the node identified by @nid.
+ * Returns GNILND_PEER_UP or GNILND_PEER_DOWN when the node is found in
+ * the RCA system-node list, GNILND_PEER_UNKNOWN if it is absent or the
+ * RCA query fails. */
+int
+kgnilnd_get_node_state(__u32 nid)
+{
+	int rc = GNILND_PEER_UNKNOWN;
+	int ret;
+	int i;
+	rs_node_array_t nlist;
+	rs_node_t *na = NULL;
+
+	ret = krca_get_sysnodes(&nlist);
+	if (ret < 0) {
+		CDEBUG(D_NETERROR, "krca_get_sysnodes failed %d\n", ret);
+		goto ns_done;
+	}
+
+	na = nlist.na_ids;
+
+	/* scan the node list for our nid; first match wins */
+	for (i = 0; i < nlist.na_len; i++) {
+		if ((rca_nid_t)RSN_GET_FLD(na[i].rs_node_flat, NID) != nid)
+			continue;
+		if (RSN_GET_FLD(na[i].rs_node_flat, STATE) == RS_CS_READY)
+			rc = GNILND_PEER_UP;
+		else
+			rc = GNILND_PEER_DOWN;
+		break;
+	}
+
+ns_done:
+	/* na is NULL on query failure; kfree(NULL) is a no-op */
+	kfree(na);
+	CDEBUG(D_NET, "nid %d rc %d (0=up)\n", nid, rc);
+	return rc;
+}
+
+#else /* GNILND_USE_RCA */
+
+/* RCA support compiled out: no listener thread to start. */
+int
+kgnilnd_start_rca_thread(void)
+{
+	return 0;
+}
+
+/* RCA support compiled out: nothing to wake. */
+void
+kgnilnd_wakeup_rca_thread(void)
+{
+}
+
+/* RCA support compiled out: without RCA there is no node-state source,
+ * so every peer is reported as up. */
+int
+kgnilnd_get_node_state(__u32 nid)
+{
+	return GNILND_PEER_UP;
+}
+#endif /* GNILND_USE_RCA */