X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Fklnds%2Fgnilnd%2Fgnilnd_stack.c;h=1da0fbe90d68ff94b8d099b76c076b0313c8ba8d;hb=31be9f94d72287d7077ffc00d392a71056f61f4d;hp=10ae493053c4cd0bca836209e6b75aaa3b24efab;hpb=4d381ef9f179b21217c237ad1cc83055a2448550;p=fs%2Flustre-release.git diff --git a/lnet/klnds/gnilnd/gnilnd_stack.c b/lnet/klnds/gnilnd/gnilnd_stack.c index 10ae493..1da0fbe 100644 --- a/lnet/klnds/gnilnd/gnilnd_stack.c +++ b/lnet/klnds/gnilnd/gnilnd_stack.c @@ -1,6 +1,8 @@ /* * Copyright (C) 2012 Cray, Inc. * + * Copyright (c) 2014, Intel Corporation. + * * Author: Nic Henke * * This file is part of Lustre, http://www.lustre.org. @@ -20,7 +22,9 @@ * */ #include "gnilnd.h" - +#if defined(GNILND_USE_RCA) +#include +#endif /* Advance all timeouts by nap_time seconds. */ void kgnilnd_bump_timeouts(__u32 nap_time, char *reason) @@ -32,7 +36,7 @@ kgnilnd_bump_timeouts(__u32 nap_time, char *reason) kgn_device_t *dev; kgn_dgram_t *dgram; - LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time); + CDEBUG(D_INFO, "%s: bumping all timeouts by %ds\n", reason, nap_time); LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n", atomic_read(&kgnilnd_data.kgn_nquiesce), @@ -54,6 +58,7 @@ kgnilnd_bump_timeouts(__u32 nap_time, char *reason) peer->gnp_reconnect_interval = 0; /* tell LNet dude is still alive */ kgnilnd_peer_alive(peer); + kgnilnd_peer_notify(peer, 0, 1); list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) { tx->tx_qtime = jiffies; @@ -73,6 +78,7 @@ kgnilnd_bump_timeouts(__u32 nap_time, char *reason) * we'll back it up and schedule the conn to trigger * a NOOP */ conn->gnc_last_tx = jiffies - timeout; + if (conn->gnc_state != GNILND_CONN_DONE) kgnilnd_schedule_conn(conn); } } @@ -112,32 +118,34 @@ kgnilnd_quiesce_wait(char *reason) wake_up_all(&dev->gnd_dgping_waitq); } + kgnilnd_wakeup_rca_thread(); + /* we'll wait for 10x the timeout for the threads to pause */ quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10); quiesce_deadline = (long) jiffies + quiesce_to; + LCONSOLE_INFO("Quiesce start: %s\n", reason); /* wait for everyone to check-in as quiesced */ - i = 1; while (!GNILND_IS_QUIESCED) { - i++; - LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET, + CDEBUG(D_INFO, "%s: Waiting for %d threads to pause\n", reason, atomic_read(&kgnilnd_data.kgn_nthreads) - atomic_read(&kgnilnd_data.kgn_nquiesce)); CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE); - cfs_pause(cfs_time_seconds(1 * i)); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1 * i)); LASSERTF(quiesce_deadline > jiffies, "couldn't quiesce threads in %lu seconds, falling over now\n", cfs_duration_sec(quiesce_to)); } - LCONSOLE_WARN("%s: All threads paused!\n", reason); + CDEBUG(D_INFO, "%s: All threads paused!\n", reason); /* XXX Nic: Is there a set of counters we can grab here to * ensure that there is no traffic until quiesce is over ?*/ } else { - /* GO! GO! GO! */ + LCONSOLE_INFO("Quiesce complete: %s\n", reason); for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; @@ -146,17 +154,16 @@ kgnilnd_quiesce_wait(char *reason) /* wait for everyone to check-in as running - they will be spinning * and looking, so no need to poke any waitq */ - i = 1; while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) { - i++; - LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET, + CDEBUG(D_INFO, "%s: Waiting for %d threads to wake up\n", reason, atomic_read(&kgnilnd_data.kgn_nquiesce)); - cfs_pause(cfs_time_seconds(1 * i)); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1 * i)); } - LCONSOLE_WARN("%s: All threads awake!\n", reason); + CDEBUG(D_INFO, "%s: All threads awake!\n", reason); } } @@ -294,7 +301,7 @@ kgnilnd_reset_stack(void) /* now all the cons/mboxes should be cleaned up, including purgatory * so go through and release the MDDs for our persistent PHYS fma_blks */ - kgnilnd_unmap_phys_fmablk(dev); + kgnilnd_unmap_fma_blocks(dev); LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0, "reset failed: fma blocks still live %d\n", @@ -362,7 +369,6 @@ kgnilnd_ruhroh_thread(void *arg) int i = 1; DEFINE_WAIT(wait); - cfs_daemonize("kgnilnd_rr"); cfs_block_allsigs(); set_user_nice(current, *kgnilnd_tunables.kgn_nice); kgnilnd_data.kgn_ruhroh_running = 1; @@ -383,7 +389,7 @@ kgnilnd_ruhroh_thread(void *arg) break; /* Serialize with driver startup and shutdown. */ - down(&kgnilnd_data.kgn_quiesce_sem); + mutex_lock(&kgnilnd_data.kgn_quiesce_mutex); CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n", kgnilnd_data.kgn_quiesce_trigger, @@ -396,7 +402,7 @@ kgnilnd_ruhroh_thread(void *arg) /* Pause all other kgnilnd threads. */ set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE); - kgnilnd_quiesce_wait("hardware quiesce flag"); + kgnilnd_quiesce_wait("hardware quiesce"); /* If the hardware quiesce flag is set, wait for it to clear. * This should happen relatively quickly, so we wait for it. @@ -411,9 +417,10 @@ kgnilnd_ruhroh_thread(void *arg) while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) { i++; - LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET, - "Waiting for hardware quiesce flag to clear\n"); - cfs_pause(cfs_time_seconds(1 * i)); + CDEBUG(D_INFO, "Waiting for hardware quiesce " + "flag to clear\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1 * i)); /* If we got a quiesce event with bump info, DO THE BUMP!. */ if (kgnilnd_data.kgn_bump_info_rdy) { @@ -448,7 +455,7 @@ kgnilnd_ruhroh_thread(void *arg) set_mb(kgnilnd_data.kgn_needs_reset, 0); } - up(&kgnilnd_data.kgn_quiesce_sem); + mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex); } kgnilnd_data.kgn_ruhroh_running = 0; @@ -562,3 +569,247 @@ kgnilnd_critical_error(struct gni_err *err_handle) CDEBUG(D_NET, "stack reset bypassed because of shutdown\n"); } } + +#if defined(GNILND_USE_RCA) +#include +#define RCA_EVENTS 3 +/* RCA ticket is needed for krca_wakeup_wait_event() */ +static krca_ticket_t rca_krt = KRCA_NULL_TICKET; +struct rcadata { + rca_ticket_t ticket; + int subscribed; + rs_event_code_t ec; +}; +static struct rcadata rd[RCA_EVENTS] = { + {0, 0, ec_node_unavailable}, + {0, 0, ec_node_available}, + {0, 0, ec_node_failed} +}; + +/* thread for receiving rca events */ +int +kgnilnd_rca(void *arg) +{ + int i, rc; + int retry_count; + rs_event_t event; + lnet_nid_t nid; + + cfs_block_allsigs(); + + /* all gnilnd threads need to run fairly urgently */ + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + + /* + * Register our service with RCA and subscribe to events + * of interest. + */ + rca_krt = KRCA_NULL_TICKET; + rc = krca_register(&rca_krt, RCA_SVCTYPE_GNILND, current->pid, 0); + if (rc < 0) { + CNETERR("krca_register(%x) returned %d\n", current->pid, rc); + goto done; + } + + for (i = 0; i < RCA_EVENTS; i++) { + retry_count = 0; +subscribe_retry: + rc = krca_subscribe(&rca_krt, rd[i].ec, RCA_RX_SVC_ANY, + &rd[i].ticket); + + if ((rc == -EINTR) && !retry_count) { + retry_count++; + CNETERR("krca_subscribe returned %d - retrying\n", rc); + goto subscribe_retry; + } + + if (rc < 0) { + CNETERR("rca subscription failed (%d)\n", rc); + goto done; + } + + rd[i].subscribed = 1; + } + + while (!kgnilnd_data.kgn_shutdown) { + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + KGNILND_SPIN_QUIESCE; + } + /* wait here for a subscribed event */ + rc = krca_wait_event(&rca_krt); + + /* RCA return values: + * 0 indicates krca_wakeup_wait_event caused krca_wait_event + * return. + * -ERESTARTSYS indicates krca_wait_event returned because of a + * signal. + * -ENOSPC indicates no space available to create an rcad_reg_t + * 1 indicates a message is waiting. + */ + if (rc <= 0) { + continue; + } + + if (krca_get_message(&rca_krt, &event) == 0) { + int node_down = GNILND_RCA_NODE_UNKNOWN; + rs_state_t state; + LIST_HEAD(zombies); + + /* Compute nodes don't care about other compute nodes + * so we don't need to create a peer. + */ + if (GNILND_COMPUTE && + !RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat, + IS_SVC)) { + continue; + } + + /* Only care about compute and service nodes not GPUs */ + if (!(RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat, + TYPE) == rt_node || + RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat, + TYPE) == rt_accel)) { + continue; + } + + switch (event.ev_id) { + case ec_node_available: + CDEBUG(D_INFO, "ec_node_available\n"); + node_down = GNILND_RCA_NODE_UP; + break; + case ec_node_failed: + CDEBUG(D_INFO, "ec_node_failed\n"); + if (event.ev_len > 0) { + CDEBUG(D_ERROR, + "ec_node_failed ignored\n"); + break; + } + node_down = GNILND_RCA_NODE_DOWN; + break; + case ec_node_unavailable: + state = RSN_GET_FLD(event.ev_gen.svid_node.rsn_intval, STATE); + + CDEBUG(D_INFO, "ec_node_unavailable\n"); + + /* + * Ignore overloaded ec_node_unavailable events + * generated by 'xtcli set_reserve'. + */ + if (RS_GET_CS_STATE(state) == RS_CS_READY) { + CDEBUG(D_INFO, "ignoring " + "ec_node_unavailable event with" + " RS_CS_READY state\n"); + break; + } + node_down = GNILND_RCA_NODE_DOWN; + break; + default: + CDEBUG(D_INFO, "unknown event\n"); + break; + } + + /* if we get an event we don't know about, just go ahead + * and wait for another event */ + if (node_down == GNILND_RCA_NODE_UNKNOWN) { + continue; + } + + nid = RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat, + NID); + CDEBUG(D_INFO,"kgnilnd_rca() reporting nid %d %s\n", + (int)nid, node_down ? "down" : "up"); + kgnilnd_report_node_state(nid, node_down); + + } else { + CNETERR("krca_get_message failed\n"); + } + } + +done: + CDEBUG(D_INFO, "done\n"); + + for (i = 0; i < RCA_EVENTS; i++) { + if (rd[i].subscribed) { + rc = krca_unsubscribe(&rca_krt, rd[i].ticket); + + if (rc) { + CNETERR("rca unsubscribe failed (%d)\n", rc); + } + + rd[i].subscribed = 0; + } + } + + krca_unregister(&rca_krt); + kgnilnd_thread_fini(); + return 0; + +} + +int +kgnilnd_start_rca_thread(void) +{ + return kgnilnd_thread_start(kgnilnd_rca, NULL, "kgnilnd_rca", 0); +} + +void +kgnilnd_wakeup_rca_thread(void) +{ + int ret; + + ret = krca_wakeup_wait_event(&rca_krt); + + if (ret) { + CDEBUG(D_ERROR, "krca_wakeup_wait_event failed\n"); + } +} + +int +kgnilnd_get_node_state(__u32 nid) +{ + int i; + int rc = GNILND_RCA_NODE_UNKNOWN; + int ret; + rs_node_array_t nlist; + rs_node_t *na = NULL; + + if ((ret = krca_get_sysnodes(&nlist)) < 0) { + CDEBUG(D_NETERROR, "krca_get_sysnodes failed %d\n", ret); + goto ns_done; + } + + na = nlist.na_ids; + + for (i = 0; i < nlist.na_len; i++) { + if ((rca_nid_t)RSN_GET_FLD(na[i].rs_node_flat, NID) == nid) { + rc = RSN_GET_FLD(na[i].rs_node_flat, STATE) == RS_CS_READY ? + GNILND_RCA_NODE_UP : GNILND_RCA_NODE_DOWN; + break; + } + } + +ns_done: + kfree(na); + CDEBUG(D_NET, "nid %d rc %d (0=up)\n", nid, rc); + return rc; +} + +#else /* GNILND_USE_RCA */ + +int +kgnilnd_start_rca_thread(void) +{ + return 0; +} + +void +kgnilnd_wakeup_rca_thread(void) +{ +} + +int +kgnilnd_get_node_state(__u32 nid) +{ + return GNILND_RCA_NODE_UP; +} +#endif /* GNILND_USE_RCA */