2 * Copyright (C) 2012 Cray, Inc.
4 * Author: Nic Henke <nic@cray.com>
6 * This file is part of Lustre, http://www.lustre.org.
8 * Lustre is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Lustre is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Lustre; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 /* Advance all timeouts by nap_time seconds. */
/* NOTE(review): this listing is decimated -- the return type, braces and
 * local declarations (i, peer, tx, conn, dev, dgram) are not visible here,
 * so the comments below describe only the lines that are shown. */
26 kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
/* announce the bump so admins can correlate it with the quiesce event */
35 LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);
/* touching shared peer/conn state is only safe while every kgnilnd
 * thread has checked in as quiesced -- assert that first */
37 LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
38 atomic_read(&kgnilnd_data.kgn_nquiesce),
39 atomic_read(&kgnilnd_data.kgn_nthreads));
41 /* requiring that the threads are paused ensures a couple of things:
42 * - combined code paths for stack reset and quiesce event as stack reset
43 * runs with the threads paused
44 * - prevents traffic to the Gemini during a quiesce period
45 * - reduces the locking requirements
/* walk every peer hash chain and refresh the per-peer timestamps;
 * note the "bump" is done by resetting timestamps to the current
 * jiffies rather than adding nap_time to the stored values */
48 for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
49 list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) {
51 /* we can reconnect again at any time */
52 peer->gnp_reconnect_time = jiffies;
53 /* reset now that network is healthy */
54 peer->gnp_reconnect_interval = 0;
55 /* tell LNet dude is still alive */
56 kgnilnd_peer_alive(peer);
/* restart the queue-time clock for every TX still waiting on this peer */
58 list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
59 tx->tx_qtime = jiffies;
62 list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
63 unsigned long timeout;
65 timeout = cfs_time_seconds(conn->gnc_timeout);
67 /* bump last_rx/last_rx_cq on all conns - including
68 * closed ones, this will have the effect of
69 * bumping the purgatory timers for those */
70 conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
72 /* we don't timeout based on old gnc_last_tx, so
73 * we'll back it up and schedule the conn to trigger
/* back-dating last_tx by a full timeout makes the scheduled conn send
 * a keepalive promptly instead of waiting out a fresh interval */
75 conn->gnc_last_tx = jiffies - timeout;
76 kgnilnd_schedule_conn(conn);
/* NOTE(review): the inner loop below reuses index 'i' from the outer
 * device loop, clobbering the device counter -- with kgn_ndevs > 1 the
 * outer loop will misbehave; it needs its own index. Also the bound is
 * kgn_peer_hash_size - 1 here while the peer walk above iterates the
 * full kgn_peer_hash_size -- confirm which is intended. */
81 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
82 dev = &kgnilnd_data.kgn_devices[i];
83 for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
84 list_for_each_entry(dgram, &dev->gnd_dgrams[i], gndg_list) {
85 dgram->gndg_post_time = jiffies;
91 /* Quiesce or wake up the stack. The caller must hold the kgn_quiesce_sem semaphore
92 * on entry, which holds off any pending stack shutdown. */
/* NOTE(review): decimated listing -- return type, braces, and the
 * declaration/increment of the loop counter 'i' are not visible. */
94 kgnilnd_quiesce_wait(char *reason)
/* non-zero trigger means we are pausing; zero means we are waking up */
98 if (kgnilnd_data.kgn_quiesce_trigger) {
99 unsigned long quiesce_deadline, quiesce_to;
102 /* morning sunshine */
/* kick the reaper out of its wait so it notices the trigger */
103 spin_lock(&kgnilnd_data.kgn_reaper_lock);
104 wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
105 spin_unlock(&kgnilnd_data.kgn_reaper_lock);
/* wake all per-device threads (scheduler, dgram, dgram-ping) so each
 * can see the trigger and park itself */
107 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
108 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
110 wake_up_all(&dev->gnd_waitq);
111 wake_up_all(&dev->gnd_dgram_waitq);
112 wake_up_all(&dev->gnd_dgping_waitq);
115 /* we'll wait for 10x the timeout for the threads to pause */
116 quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
117 quiesce_deadline = (long) jiffies + quiesce_to;
119 /* wait for everyone to check-in as quiesced */
121 while (!GNILND_IS_QUIESCED) {
/* ((i) & (-i)) == i is true only when i is a power of two (or 0):
 * escalate to console-visible D_WARNING on those passes, D_NET otherwise */
123 LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
124 "%s: Waiting for %d threads to pause\n",
126 atomic_read(&kgnilnd_data.kgn_nthreads) -
127 atomic_read(&kgnilnd_data.kgn_nquiesce));
128 CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
/* back off progressively longer each pass (i presumably increments
 * on a line not shown in this listing) */
129 cfs_pause(cfs_time_seconds(1 * i));
/* NOTE(review): raw '>' comparison against jiffies is wrap-unsafe;
 * kernel convention is time_after()/time_before() -- confirm */
131 LASSERTF(quiesce_deadline > jiffies,
132 "couldn't quiesce threads in %lu seconds, falling over now\n",
133 cfs_duration_sec(quiesce_to));
136 LCONSOLE_WARN("%s: All threads paused!\n", reason);
137 /* XXX Nic: Is there a set of counters we can grab here to
138 * ensure that there is no traffic until quiesce is over ?*/
/* wake-up path: trigger is clear, prod the dgram machinery back to life */
142 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
143 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
144 kgnilnd_schedule_dgram(dev);
147 /* wait for everyone to check-in as running - they will be spinning
148 * and looking, so no need to poke any waitq */
150 while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
152 LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
153 "%s: Waiting for %d threads to wake up\n",
155 atomic_read(&kgnilnd_data.kgn_nquiesce));
156 cfs_pause(cfs_time_seconds(1 * i));
159 LCONSOLE_WARN("%s: All threads awake!\n", reason);
163 /* Reset the stack. */
/* Tears down and re-initializes all gnilnd HW resources after a critical
 * hardware error: quiesces threads, cancels dgrams, nukes conns (including
 * purgatory), runs dev_fini/dev_init, then bumps timeouts and resumes.
 * NOTE(review): decimated listing -- return type, braces, and several
 * declarations (i, rc, net, conn, conn_sched, dgram, souls, seconds) and
 * statements (e.g. the 'start'/'end' jiffies assignments) are missing. */
165 kgnilnd_reset_stack(void)
169 kgn_peer_t *peer, *peerN;
171 char *reason = "critical hardware error";
173 unsigned long start, end;
176 /* Race with del_peer and its atomics */
177 CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
/* refuse to reset a stack that never finished initializing */
179 if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
180 CERROR("can't reset the stack, gnilnd is not initialized\n");
184 /* First make sure we are not already quiesced - we panic if so,
185 * as that could leave software in a bad state */
186 LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE,
187 "can't reset the stack, already doing so: trigger %d\n",
188 kgnilnd_data.kgn_quiesce_trigger);
190 set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET);
192 /* wake up the dgram waitq thread - but after trigger set to make sure it
193 * goes into quiesce */
194 CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
195 /* same for scheduler that is dropping state transitions */
196 CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
197 CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);
/* block here until every thread has parked itself */
199 kgnilnd_quiesce_wait(reason);
203 kgnilnd_data.kgn_in_reset = 1;
204 kgnilnd_data.kgn_nresets++;
205 LCONSOLE_WARN("%s: resetting all resources (count %d)\n",
206 reason, kgnilnd_data.kgn_nresets);
/* cancel outstanding datagrams on every net in the net hash */
208 for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
209 list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
210 rc = kgnilnd_cancel_net_dgrams(net);
211 LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc);
215 /* error -ENOTRECOVERABLE is stack reset */
216 kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE);
/* flush wildcard dgrams per device and wait for their cancellation */
218 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
219 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
220 kgnilnd_cancel_wc_dgrams(dev);
221 kgnilnd_wait_for_canceled_dgrams(dev);
224 /* manually do some conn processing ala kgnilnd_process_conns */
225 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
226 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
230 /* go find all the closed conns that need to be nuked - the
231 * scheduler thread isn't running to do this for us */
233 CDEBUG(D_NET, "will try to clear up %d ready_conns\n",
234 kgnilnd_count_list(&dev->gnd_ready_conns));
236 /* use while/list_first_entry loop to ensure we can handle any
237 * DESTROY_EP conns added from kgnilnd_complete_closed_conn */
238 while (!list_empty(&dev->gnd_ready_conns)) {
239 conn = list_first_entry(&dev->gnd_ready_conns,
240 kgn_conn_t, gnc_schedlist);
/* atomically claim the conn for processing, like the scheduler would */
241 conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);
243 LASSERTF(conn_sched != GNILND_CONN_IDLE &&
244 conn_sched != GNILND_CONN_PROCESS,
245 "conn %p on ready list but in bad state: %d\n",
248 list_del_init(&conn->gnc_schedlist);
250 if (conn->gnc_state == GNILND_CONN_CLOSING) {
251 /* bump to CLOSED to fake out send of CLOSE */
252 conn->gnc_state = GNILND_CONN_CLOSED;
253 conn->gnc_close_sent = 1;
256 if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
257 kgnilnd_destroy_conn_ep(conn);
259 kgnilnd_complete_closed_conn(conn);
262 /* there really shouldn't be any other states here -
263 * they would have been cleared out in the del_peer_or_conn or the dgram
265 * there is an LASSERTF in kgnilnd_complete_closed_conn that will take
266 * care of catching anything else for us */
268 kgnilnd_schedule_process_conn(conn, -1);
270 kgnilnd_conn_decref(conn);
274 /* don't let the little weasily purgatory conns hide from us */
275 for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
276 list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) {
277 kgn_conn_t *conn, *connN;
279 list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) {
280 kgnilnd_detach_purgatory_locked(conn, &souls);
285 CDEBUG(D_NET, "about to release %d purgatory entries\n",
286 kgnilnd_count_list(&souls));
288 kgnilnd_release_purgatory_list(&souls);
290 /* validate we are now clean */
291 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
292 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
294 /* now all the cons/mboxes should be cleaned up, including purgatory
295 * so go through and release the MDDs for our persistent PHYS fma_blks
297 kgnilnd_unmap_phys_fmablk(dev);
299 LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
300 "reset failed: fma blocks still live %d\n",
301 atomic_read(&dev->gnd_nfmablk));
303 LASSERTF(atomic_read(&dev->gnd_neps) == 0,
304 "reset failed: EP handles still live %d\n",
305 atomic_read(&dev->gnd_neps));
308 LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
309 "reset failed: conns left %d\n",
310 atomic_read(&kgnilnd_data.kgn_nconns));
312 /* fine to have peers left - they are waiting for new conns
313 * but should not be holding any open HW resources */
315 /* like the last part of kgnilnd_base_shutdown() */
317 CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);
319 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
320 kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]);
323 /* no need to free and recreate the TX descriptors
324 * we nuked all the ones that could be using HW resources in
325 * kgnilnd_close_matching_conns and asserted it worked in
326 * kgnilnd_dev_fini */
328 /* At this point, all HW is torn down, start to reset */
330 /* only reset our known devs */
331 for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
332 kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
333 rc = kgnilnd_dev_init(dev);
334 LASSERTF(rc == 0, "dev_init failed for dev %d\n", i);
335 kgnilnd_map_phys_fmablk(dev);
/* NOTE(review): 'rc' is not assigned from kgnilnd_map_phys_fmablk above,
 * so this LASSERTF re-checks the stale dev_init result -- either capture
 * the map call's return in rc or drop this assert */
336 LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i);
337 rc = kgnilnd_setup_wildcard_dgram(dev);
338 LASSERTF(rc == 0, "couldnt setup datagrams on dev %d: %d\n",
342 /* Now the fun restarts... - release the hounds! */
/* compensate all timeouts for the time spent in reset; 'start'/'end'
 * are presumably set to jiffies around the reset window on lines not
 * shown in this listing -- confirm against the full source */
345 seconds = cfs_duration_sec((long)end - start);
346 kgnilnd_bump_timeouts(seconds, reason);
348 kgnilnd_data.kgn_in_reset = 0;
/* drop the trigger and let kgnilnd_quiesce_wait release the threads */
349 set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
350 kgnilnd_quiesce_wait(reason);
351 LCONSOLE_WARN("%s reset of all hardware resources\n",
352 rc ? "failed" : "successful");
357 /* A thread that handles quiesce and reset hardware events.
358 * We do the same thing regardless of which device reported the event. */
/* NOTE(review): decimated listing -- return type, braces, the 'wait'
 * queue entry declaration, the schedule() call inside the wait loop, and
 * the outer forever-loop construct are not visible here. */
360 kgnilnd_ruhroh_thread(void *arg)
365 cfs_daemonize("kgnilnd_rr");
/* run at the configured niceness like the other gnilnd threads */
367 set_user_nice(current, *kgnilnd_tunables.kgn_nice);
368 kgnilnd_data.kgn_ruhroh_running = 1;
372 /* Block until there's a request.. A reset request could come in
373 * while we're handling a quiesce one, or vice versa.
374 * Keep processing requests until there are none.*/
375 prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE);
376 while (!(kgnilnd_data.kgn_ruhroh_shutdown ||
377 kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause))
379 finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait);
381 /* Exit if the driver is shutting down. */
382 if (kgnilnd_data.kgn_ruhroh_shutdown)
385 /* Serialize with driver startup and shutdown. */
386 down(&kgnilnd_data.kgn_quiesce_sem);
388 CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
389 kgnilnd_data.kgn_quiesce_trigger,
390 kgnilnd_data.kgn_needs_reset,
391 kgnilnd_data.kgn_bump_info_rdy,
392 kgnilnd_data.kgn_needs_pause);
394 /* Do we need to do a pause/quiesce? */
395 if (kgnilnd_data.kgn_needs_pause) {
397 /* Pause all other kgnilnd threads. */
398 set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
399 kgnilnd_quiesce_wait("hardware quiesce flag");
401 /* If the hardware quiesce flag is set, wait for it to clear.
402 * This should happen relatively quickly, so we wait for it.
403 * This will hold up the eventd thread, but on everything but
404 * the simulator, this is ok-- there is one thread per core.
406 * Handle (possibly multiple) quiesce events while we wait. The
407 * memory barrier ensures that the core doesn't start fetching
408 * kgn_bump_info_rdy before it fetches kgn_needs_pause, and
409 * matches the second mb in kgnilnd_quiesce_end_callback(). */
411 while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
/* power-of-two passes of 'i' get console-level warnings, rest D_NET */
414 LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
415 "Waiting for hardware quiesce flag to clear\n");
416 cfs_pause(cfs_time_seconds(1 * i));
418 /* If we got a quiesce event with bump info, DO THE BUMP!. */
419 if (kgnilnd_data.kgn_bump_info_rdy) {
420 /* reset console rate limiting for each event */
423 /* Make sure the core doesn't start fetching
424 * kgni_quiesce_seconds until after it sees
425 * kgn_bump_info_rdy set. This is the match to the
426 * first mb in kgnilnd_quiesce_end_callback(). */
428 (void) kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs,
429 "hardware quiesce callback");
/* clear interval before the ready flag so a new callback can't
 * see a stale-seconds/ready combination */
430 set_mb(kgnilnd_data.kgn_quiesce_secs, 0);
431 set_mb(kgnilnd_data.kgn_bump_info_rdy, 0);
435 /* Reset the kgn_needs_pause flag before coming out of
436 * the pause. This ordering avoids a race with the
437 * setting of this flag in kgnilnd_pause_threads(). */
438 set_mb(kgnilnd_data.kgn_needs_pause, 0);
440 /* ok, let the kids back into the pool */
441 set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
442 kgnilnd_quiesce_wait("hardware quiesce");
445 /* Do a stack reset if needed. */
446 if (kgnilnd_data.kgn_needs_reset) {
447 kgnilnd_reset_stack();
448 set_mb(kgnilnd_data.kgn_needs_reset, 0);
451 up(&kgnilnd_data.kgn_quiesce_sem);
/* shutdown path: mark the thread as no longer running */
454 kgnilnd_data.kgn_ruhroh_running = 0;
458 /* Set pause request flag. Any functions that
459 * call this one are responsible for ensuring that
460 * variables they set up are visible on other cores before
461 * this flag setting. This executes in interrupt or kernel
/* NOTE(review): decimated listing -- return type, braces, and the read
 * memory barrier referenced by the comment below are not visible. */
464 kgnilnd_pause_threads(void)
466 /* only device 0 gets the handle, see kgnilnd_dev_init */
467 kgn_device_t *dev = &kgnilnd_data.kgn_devices[0];
468 LASSERTF(dev != NULL, "dev 0 is NULL\n");
470 /* If we're currently in a pause triggered by the pause flag,
471 * there's no need to set it again. We clear the kgn_needs_pause
472 * flag before we reset kgn_quiesce_trigger to avoid a race. The
473 * read memory barrier matches the setmb() on the trigger in
474 * kgnilnd_ruhroh_task(). */
476 if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE &&
477 GNILND_IS_QUIESCED)) {
478 CDEBUG(D_NET, "requesting thread pause\n");
/* flag the request, then wake the ruhroh thread to act on it */
480 kgnilnd_data.kgn_needs_pause = 1;
482 wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
484 CDEBUG(D_NET, "thread pause already underway\n");
488 /* Return non-zero if the GNI hardware quiesce flag is set */
/* NOTE(review): decimated listing -- return type and braces not visible. */
490 kgnilnd_hw_in_quiesce(void)
492 /* only device 0 gets the handle, see kgnilnd_dev_init */
493 kgn_device_t *dev0 = &kgnilnd_data.kgn_devices[0];
495 LASSERTF(dev0 != NULL, "dev 0 is NULL\n");
/* normalize the GNI status to a strict 0/non-zero boolean */
498 return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0;
502 /* If the GNI hardware quiesce flag is set, initiate our pause and
503 * return non-zero. Also return non-zero if the stack is shutting down. */
/* NOTE(review): decimated listing -- return type, braces, and the actual
 * return statements (presumably 0 on the fast path, 1 otherwise) are not
 * visible here; confirm against the full source. */
505 kgnilnd_check_hw_quiesce(void)
/* fast path: hardware flag clear, nothing to do */
507 if (likely(!kgnilnd_hw_in_quiesce()))
510 if (!kgnilnd_data.kgn_ruhroh_shutdown) {
511 CDEBUG(D_NET, "initiating thread pause\n");
512 kgnilnd_pause_threads();
514 CDEBUG(D_NET, "thread pause bypassed because of shutdown\n");
520 /* Callback from kngi with the quiesce duration. This executes
521 * in interrupt context. */
/* NOTE(review): decimated listing -- return type and braces not visible;
 * nic_handle is unused in the lines shown. */
523 kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs)
525 /* only device 0 gets the handle, see kgnilnd_dev_init */
526 kgn_device_t *dev = &kgnilnd_data.kgn_devices[0];
527 LASSERTF(dev != NULL, "dev 0 is NULL\n");
529 if (!kgnilnd_data.kgn_ruhroh_shutdown) {
531 CDEBUG(D_NET, "requesting timeout bump by "LPD64" msecs\n", msecs);
533 /* Save the bump interval and request the bump.
534 * The memory barrier ensures that the interval is in place before
535 * the bump flag can be seen (in case a core is already running the
536 * ruhroh task), and that the bump request flag in place before
537 * the pause request can be seen (to ensure a core doesn't miss the bump
539 /* If another callback occurred before the ruhroh task
540 * finished processing the first bump request, we'd over-write its info.
541 * Nic says that callbacks occur so slowly that this isn't an issue. */
/* order matters: seconds first, then ready flag, then the pause request */
542 set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC);
543 set_mb(kgnilnd_data.kgn_bump_info_rdy, 1);
544 kgnilnd_pause_threads();
546 CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n");
/* Callback for a critical GNI hardware error: request a full stack reset
 * by flagging kgn_needs_reset and waking the ruhroh thread, unless the
 * driver is already shutting down.
 * NOTE(review): decimated listing -- return type and braces not visible;
 * err_handle is unused in the lines shown. */
551 kgnilnd_critical_error(struct gni_err *err_handle)
553 /* only device 0 gets the handle, see kgnilnd_dev_init */
554 kgn_device_t *dev = &kgnilnd_data.kgn_devices[0];
555 LASSERTF(dev != NULL, "dev 0 is NULL\n");
557 if (!kgnilnd_data.kgn_ruhroh_shutdown) {
558 CDEBUG(D_NET, "requesting stack reset\n");
/* the ruhroh thread performs the actual reset under kgn_quiesce_sem */
559 kgnilnd_data.kgn_needs_reset = 1;
560 wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
562 CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");