/*
 * Copyright (C) 2012 Cray, Inc.
 *
 *   Author: Nic Henke <nic@cray.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */
#include "gnilnd.h"
#if defined(GNILND_USE_RCA)
#include <rsms/rs_sm_states.h>
#endif

/* Advance all timeouts by nap_time seconds. */
void
kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
{
        int                     i;
        kgn_peer_t             *peer;
        kgn_conn_t             *conn;
        kgn_tx_t               *tx;
        kgn_device_t           *dev;
        kgn_dgram_t            *dgram;

        LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);

        LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
                 atomic_read(&kgnilnd_data.kgn_nquiesce),
                 atomic_read(&kgnilnd_data.kgn_nthreads));
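        /* Note: GNILND_IS_QUIESCED (see gnilnd.h) compares kgn_nquiesce against
         * kgn_nthreads, so it only goes true once every kgnilnd thread has
         * parked itself for the quiesce. */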

        /* requiring that the threads are paused ensures a couple of things:
         * - combined code paths for stack reset and quiesce event as stack reset
         *   runs with the threads paused
         * - prevents traffic to the Gemini during a quiesce period
         * - reduces the locking requirements
         */

        for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
                list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) {

                        /* we can reconnect again at any time */
                        peer->gnp_reconnect_time = jiffies;
                        /* reset now that network is healthy */
                        peer->gnp_reconnect_interval = 0;
                        /* tell LNet dude is still alive */
                        kgnilnd_peer_alive(peer);

                        list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
                                tx->tx_qtime = jiffies;
                        }

                        list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
                                unsigned long           timeout;

                                timeout = cfs_time_seconds(conn->gnc_timeout);

                                /* bump last_rx/last_rx_cq on all conns - including
                                 * closed ones, this will have the effect of
                                 * bumping the purgatory timers for those */
                                conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;

                                /* we don't timeout based on old gnc_last_tx, so
                                 * we'll back it up and schedule the conn to trigger
                                 * a NOOP */
                                conn->gnc_last_tx = jiffies - timeout;
                                if (conn->gnc_state != GNILND_CONN_DONE)
                                        kgnilnd_schedule_conn(conn);
                        }
                }
        }

        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                int j;

                dev = &kgnilnd_data.kgn_devices[i];
                /* walk the dgram hash with its own index so the device loop
                 * counter isn't clobbered */
                for (j = 0; j < (*kgnilnd_tunables.kgn_peer_hash_size - 1); j++) {
                        list_for_each_entry(dgram, &dev->gnd_dgrams[j], gndg_list) {
                                dgram->gndg_post_time = jiffies;
                        }
                }
        }
}

/* Quiesce or wake up the stack.  The caller must hold the kgn_quiesce_mutex
 * on entry, which holds off any pending stack shutdown.  If the quiesce
 * trigger is set we wait for every thread to pause; otherwise we wait for
 * them all to report back in as running. */
void
kgnilnd_quiesce_wait(char *reason)
{
        int             i;

        if (kgnilnd_data.kgn_quiesce_trigger) {
                unsigned long   quiesce_deadline, quiesce_to;
                /* FREEZE TAG!!!! */

                /* morning sunshine */
                spin_lock(&kgnilnd_data.kgn_reaper_lock);
                wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
                spin_unlock(&kgnilnd_data.kgn_reaper_lock);

                for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                        kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];

                        wake_up_all(&dev->gnd_waitq);
                        wake_up_all(&dev->gnd_dgram_waitq);
                        wake_up_all(&dev->gnd_dgping_waitq);
                }

                kgnilnd_wakeup_rca_thread();

                /* we'll wait for 10x the timeout for the threads to pause */
                quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
                quiesce_deadline = (long) jiffies + quiesce_to;

                /* wait for everyone to check-in as quiesced */
                i = 1;
                while (!GNILND_IS_QUIESCED) {
                        i++;
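                        /* (i & -i) == i only while i is a power of two, so the
                         * console message below is emitted on passes 2, 4, 8, ...
                         * and drops to D_NET debug chatter in between */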
                        LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
                                 "%s: Waiting for %d threads to pause\n",
                                 reason,
                                 atomic_read(&kgnilnd_data.kgn_nthreads) -
                                 atomic_read(&kgnilnd_data.kgn_nquiesce));
                        CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
                        cfs_pause(cfs_time_seconds(1 * i));

                        LASSERTF(quiesce_deadline > jiffies,
                                 "couldn't quiesce threads in %lu seconds, falling over now\n",
                                 cfs_duration_sec(quiesce_to));
                }

                LCONSOLE_WARN("%s: All threads paused!\n", reason);
                /* XXX Nic: Is there a set of counters we can grab here to
                 * ensure that there is no traffic until quiesce is over? */
        } else {
                /* GO! GO! GO! */

                for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                        kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
                        kgnilnd_schedule_dgram(dev);
                }

                /* wait for everyone to check-in as running - they will be spinning
                 * and looking, so no need to poke any waitq */
                i = 1;
                while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
                        i++;
                        LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
                                 "%s: Waiting for %d threads to wake up\n",
                                 reason,
                                 atomic_read(&kgnilnd_data.kgn_nquiesce));
                        cfs_pause(cfs_time_seconds(1 * i));
                }

                LCONSOLE_WARN("%s: All threads awake!\n", reason);
        }
}

/* Reset the stack.  */
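/* Rough sequence, as implemented below: quiesce every kgnilnd thread, cancel
 * outstanding datagrams, close and reap all conns (purgatory included), tear
 * the devices down with kgnilnd_dev_fini(), bring them back up with
 * kgnilnd_dev_init() and fresh wildcard datagrams, then bump all timeouts by
 * the time we spent down and let the threads loose again. */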
void
kgnilnd_reset_stack(void)
{
        int              i, rc = 0;
        kgn_net_t       *net;
        kgn_peer_t      *peer, *peerN;
        LIST_HEAD        (souls);
        char            *reason = "critical hardware error";
        __u32            seconds;
        unsigned long    start, end;
        ENTRY;

        /* Race with del_peer and its atomics */
        CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
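        /* CFS_RACE() is a libcfs fault-injection hook: it only does anything
         * when the matching cfs_fail_loc value has been set (typically by the
         * regression tests) and is otherwise close to free. */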

        if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
                CERROR("can't reset the stack, gnilnd is not initialized\n");
                RETURN_EXIT;
        }

        /* First make sure we are not already quiesced - we panic if so,
         * as that could leave software in a bad state */
        LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE,
                 "can't reset the stack, already doing so: trigger %d\n",
                 kgnilnd_data.kgn_quiesce_trigger);

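        /* set_mb() stores the new value and issues a full memory barrier, so
         * other cores are guaranteed to see the reset trigger before we start
         * waking them below. */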
        set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET);

        /* wake up the dgram waitq thread - but after trigger set to make sure it
         * goes into quiesce */
        CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
        /* same for the scheduler that is dropping state transitions */
        CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
        CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);

        kgnilnd_quiesce_wait(reason);

        start = jiffies;

        kgnilnd_data.kgn_in_reset = 1;
        kgnilnd_data.kgn_nresets++;
        LCONSOLE_WARN("%s: resetting all resources (count %d)\n",
                      reason, kgnilnd_data.kgn_nresets);

        for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
                list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
                        rc = kgnilnd_cancel_net_dgrams(net);
                        LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc);
                }
        }

        /* error -ENOTRECOVERABLE is stack reset */
        kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE);

        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
                kgnilnd_cancel_wc_dgrams(dev);
                kgnilnd_wait_for_canceled_dgrams(dev);
        }

        /* manually do some conn processing ala kgnilnd_process_conns */
        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
                kgn_conn_t      *conn;
                int              conn_sched;

                /* go find all the closed conns that need to be nuked - the
                 * scheduler thread isn't running to do this for us */

                CDEBUG(D_NET, "will try to clear up %d ready_conns\n",
                        kgnilnd_count_list(&dev->gnd_ready_conns));

                /* use while/list_first_entry loop to ensure we can handle any
                 * DESTROY_EP conns added from kgnilnd_complete_closed_conn */
                while (!list_empty(&dev->gnd_ready_conns)) {
                        conn = list_first_entry(&dev->gnd_ready_conns,
                                                kgn_conn_t, gnc_schedlist);
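                        /* xchg() atomically flags the conn as being processed
                         * and hands back the prior scheduling state so we can
                         * sanity check it */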
                        conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);

                        LASSERTF(conn_sched != GNILND_CONN_IDLE &&
                                 conn_sched != GNILND_CONN_PROCESS,
                                 "conn %p on ready list but in bad state: %d\n",
                                 conn, conn_sched);

                        list_del_init(&conn->gnc_schedlist);

                        if (conn->gnc_state == GNILND_CONN_CLOSING) {
                                /* bump to CLOSED to fake out send of CLOSE */
                                conn->gnc_state = GNILND_CONN_CLOSED;
                                conn->gnc_close_sent = 1;
                        }

                        if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
                                kgnilnd_destroy_conn_ep(conn);
                        } else {
                                kgnilnd_complete_closed_conn(conn);
                        }

                        /* there really shouldn't be any other states here -
                         * they would have been cleared out in the del_peer_or_conn or the dgram
                         * aborts above.
                         * there is an LASSERTF in kgnilnd_complete_closed_conn that will take
                         * care of catching anything else for us */

                        kgnilnd_schedule_process_conn(conn, -1);

                        kgnilnd_conn_decref(conn);
                }
        }

        /* don't let the little weaselly purgatory conns hide from us */
        for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
                list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) {
                        kgn_conn_t       *conn, *connN;

                        list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) {
                                kgnilnd_detach_purgatory_locked(conn, &souls);
                        }
                }
        }

        CDEBUG(D_NET, "about to release %d purgatory entries\n",
                kgnilnd_count_list(&souls));

        kgnilnd_release_purgatory_list(&souls);

        /* validate we are now clean */
        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];

                /* now all the conns/mboxes should be cleaned up, including purgatory
                 * so go through and release the MDDs for our persistent PHYS fma_blks
                 */
                kgnilnd_unmap_fma_blocks(dev);

                LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
                        "reset failed: fma blocks still live %d\n",
                        atomic_read(&dev->gnd_nfmablk));

                LASSERTF(atomic_read(&dev->gnd_neps) == 0,
                        "reset failed: EP handles still live %d\n",
                        atomic_read(&dev->gnd_neps));
        }

        LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
                "reset failed: conns left %d\n",
                atomic_read(&kgnilnd_data.kgn_nconns));

        /* fine to have peers left - they are waiting for new conns
         * but should not be holding any open HW resources */

        /* like the last part of kgnilnd_base_shutdown() */

        CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);

        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]);
        }

        /* no need to free and recreate the TX descriptors
         * we nuked all the ones that could be using HW resources in
         * kgnilnd_close_matching_conns and asserted it worked in
         * kgnilnd_dev_fini */

        /* At this point, all HW is torn down, start to reset */

        /* only reset our known devs */
        for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
                kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
                rc = kgnilnd_dev_init(dev);
                LASSERTF(rc == 0, "dev_init failed for dev %d\n", i);
                rc = kgnilnd_map_phys_fmablk(dev);
                LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i);
                rc = kgnilnd_setup_wildcard_dgram(dev);
                LASSERTF(rc == 0, "couldn't setup datagrams on dev %d: %d\n",
                        i, rc);
        }

        /* Now the fun restarts... - release the hounds! */

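        /* credit everyone for the wall-clock time we spent in reset so nothing
         * times out the instant the threads start running again */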
        end = jiffies;
        seconds = cfs_duration_sec((long)end - start);
        kgnilnd_bump_timeouts(seconds, reason);

        kgnilnd_data.kgn_in_reset = 0;
        set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
        kgnilnd_quiesce_wait(reason);
        LCONSOLE_WARN("%s reset of all hardware resources\n",
                rc ? "failed" : "successful");

        RETURN_EXIT;
}

/* A thread that handles quiesce and reset hardware events.
 * We do the same thing regardless of which device reported the event. */
int
kgnilnd_ruhroh_thread(void *arg)
{
        int                i = 1;
        DEFINE_WAIT(wait);

        cfs_block_allsigs();
        set_user_nice(current, *kgnilnd_tunables.kgn_nice);
        kgnilnd_data.kgn_ruhroh_running = 1;

        while (1) {

                /* Block until there's a request..  A reset request could come in
                 * while we're handling a quiesce one, or vice versa.
                 * Keep processing requests until there are none.*/
                prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE);
                while (!(kgnilnd_data.kgn_ruhroh_shutdown ||
                                kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause))
                        schedule();
                finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait);

                /* Exit if the driver is shutting down. */
                if (kgnilnd_data.kgn_ruhroh_shutdown)
                        break;

                /* Serialize with driver startup and shutdown. */
                mutex_lock(&kgnilnd_data.kgn_quiesce_mutex);

                CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
                        kgnilnd_data.kgn_quiesce_trigger,
                        kgnilnd_data.kgn_needs_reset,
                        kgnilnd_data.kgn_bump_info_rdy,
                        kgnilnd_data.kgn_needs_pause);

                /* Do we need to do a pause/quiesce? */
                if (kgnilnd_data.kgn_needs_pause) {

                        /* Pause all other kgnilnd threads. */
                        set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
                        kgnilnd_quiesce_wait("hardware quiesce flag");

                        /* If the hardware quiesce flag is set, wait for it to clear.
                         * This should happen relatively quickly, so we wait for it.
                         * This will hold up the eventd thread, but on everything but
                         * the simulator, this is ok-- there is one thread per core.
                         *
                         * Handle (possibly multiple) quiesce events while we wait. The
                         * memory barrier ensures that the core doesn't start fetching
                         * kgn_bump_info_rdy before it fetches kgn_needs_pause, and
                         * matches the second mb in kgnilnd_quiesce_end_callback(). */
                        smp_rmb();
                        while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {

                                i++;
                                LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
                                                "Waiting for hardware quiesce flag to clear\n");
                                cfs_pause(cfs_time_seconds(1 * i));

                                /* If we got a quiesce event with bump info, DO THE BUMP!. */
                                if (kgnilnd_data.kgn_bump_info_rdy) {
                                        /* reset console rate limiting for each event */
                                        i = 1;

                                        /* Make sure the core doesn't start fetching
                                         * kgn_quiesce_secs until after it sees
                                         * kgn_bump_info_rdy set.  This is the match to the
                                         * first mb in kgnilnd_quiesce_end_callback(). */
                                        smp_rmb();
                                        kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs,
                                                              "hardware quiesce callback");
                                        set_mb(kgnilnd_data.kgn_quiesce_secs, 0);
                                        set_mb(kgnilnd_data.kgn_bump_info_rdy, 0);
                                }
                        }

                        /* Reset the kgn_needs_pause flag before coming out of
                         * the pause.  This ordering avoids a race with the
                         * setting of this flag in kgnilnd_pause_threads().  */
                        set_mb(kgnilnd_data.kgn_needs_pause, 0);

                        /* ok, let the kids back into the pool */
                        set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
                        kgnilnd_quiesce_wait("hardware quiesce");
                }

                /* Do a stack reset if needed. */
                if (kgnilnd_data.kgn_needs_reset) {
                        kgnilnd_reset_stack();
                        set_mb(kgnilnd_data.kgn_needs_reset, 0);
                }

                mutex_unlock(&kgnilnd_data.kgn_quiesce_mutex);
        }

        kgnilnd_data.kgn_ruhroh_running = 0;
        return 0;
}

/* Set pause request flag.  Any functions that
 * call this one are responsible for ensuring that
 * variables they set up are visible on other cores before
 * this flag setting.  This executes in interrupt or kernel
 * thread context.  */
void
kgnilnd_pause_threads(void)
{
        /* only device 0 gets the handle, see kgnilnd_dev_init */
        kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
        LASSERTF(dev != NULL, "dev 0 is NULL\n");

        /* If we're currently in a pause triggered by the pause flag,
         * there's no need to set it again.  We clear the kgn_needs_pause
         * flag before we reset kgn_quiesce_trigger to avoid a race.  The
         * read memory barrier matches the set_mb() on the trigger in
         * kgnilnd_ruhroh_thread().                                     */
        smp_rmb();
        if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE &&
                        GNILND_IS_QUIESCED)) {
                CDEBUG(D_NET, "requesting thread pause\n");

                kgnilnd_data.kgn_needs_pause = 1;

                wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
        } else {
                CDEBUG(D_NET, "thread pause already underway\n");
        }
}

/* Return non-zero if the GNI hardware quiesce flag is set */
int
kgnilnd_hw_in_quiesce(void)
{
        /* only device 0 gets the handle, see kgnilnd_dev_init */
        kgn_device_t      *dev0 = &kgnilnd_data.kgn_devices[0];

        LASSERTF(dev0 != NULL, "dev 0 is NULL\n");

        smp_rmb();
        return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0;
}


/* If the GNI hardware quiesce flag is set, initiate our pause and
 * return non-zero.  Also return non-zero if the stack is shutting down. */
int
kgnilnd_check_hw_quiesce(void)
{
        if (likely(!kgnilnd_hw_in_quiesce()))
                return 0;

        if (!kgnilnd_data.kgn_ruhroh_shutdown) {
                CDEBUG(D_NET, "initiating thread pause\n");
                kgnilnd_pause_threads();
        } else {
                CDEBUG(D_NET, "thread pause bypassed because of shutdown\n");
        }

        return 1;
}

/* Callback from kgni with the quiesce duration.  This executes
 * in interrupt context.                                        */
void
kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs)
{
        /* only device 0 gets the handle, see kgnilnd_dev_init */
        kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
        LASSERTF(dev != NULL, "dev 0 is NULL\n");

        if (!kgnilnd_data.kgn_ruhroh_shutdown) {

                CDEBUG(D_NET, "requesting timeout bump by "LPD64" msecs\n", msecs);

                /* Save the bump interval and request the bump.
                 * The memory barrier ensures that the interval is in place before
                 * the bump flag can be seen (in case a core is already running the
                 * ruhroh task), and that the bump request flag is in place before
                 * the pause request can be seen (to ensure a core doesn't miss the
                 * bump request flag). */
                /* If another callback occurred before the ruhroh task
                 * finished processing the first bump request, we'd over-write its info.
                 * Nic says that callbacks occur so slowly that this isn't an issue.    */
                set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC);
                set_mb(kgnilnd_data.kgn_bump_info_rdy, 1);
                kgnilnd_pause_threads();
        } else {
                CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n");
        }
}

void
kgnilnd_critical_error(struct gni_err *err_handle)
{
        /* only device 0 gets the handle, see kgnilnd_dev_init */
        kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
        LASSERTF(dev != NULL, "dev 0 is NULL\n");

        if (!kgnilnd_data.kgn_ruhroh_shutdown) {
                CDEBUG(D_NET, "requesting stack reset\n");
                kgnilnd_data.kgn_needs_reset = 1;
                wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
        } else {
                CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");
        }
}

#if defined(GNILND_USE_RCA)
#include <krca_lib.h>
#define RCA_EVENTS 3
/* RCA ticket is needed for krca_wakeup_wait_event() */
static krca_ticket_t rca_krt = KRCA_NULL_TICKET;
struct rcadata {
        rca_ticket_t ticket;
        int subscribed;
        rs_event_code_t ec;
};
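/* one subscription slot per RCA event code we care about */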
static struct rcadata rd[RCA_EVENTS] = {
        {0, 0, ec_node_unavailable},
        {0, 0, ec_node_available},
        {0, 0, ec_node_failed}
};

/* thread for receiving rca events */
int
kgnilnd_rca(void *arg)
{
        int        i, rc;
        int        retry_count;
        rs_event_t event;
        lnet_nid_t nid;

        cfs_block_allsigs();

        /* all gnilnd threads need to run fairly urgently */
        set_user_nice(current, *kgnilnd_tunables.kgn_nice);

        /*
         * Register our service with RCA and subscribe to events
         * of interest.
         */
        rca_krt = KRCA_NULL_TICKET;
        rc = krca_register(&rca_krt, RCA_SVCTYPE_GNILND, current->pid, 0);
        if (rc < 0) {
                CNETERR("krca_register(%x) returned %d\n", current->pid, rc);
                goto done;
        }

        for (i = 0; i < RCA_EVENTS; i++) {
                retry_count = 0;
subscribe_retry:
                rc = krca_subscribe(&rca_krt, rd[i].ec, RCA_RX_SVC_ANY,
                                    &rd[i].ticket);

                if ((rc == -EINTR) && !retry_count) {
                        retry_count++;
                        CNETERR("krca_subscribe returned %d - retrying\n", rc);
                        goto subscribe_retry;
                }

                if (rc < 0) {
                        CNETERR("rca subscription failed (%d)\n", rc);
                        goto done;
                }

                rd[i].subscribed = 1;
        }

        while (!kgnilnd_data.kgn_shutdown) {
                if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
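                        /* quiesce in progress: park this thread until the
                         * trigger is cleared (it checks itself in via
                         * kgn_nquiesce; see KGNILND_SPIN_QUIESCE in gnilnd.h) */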
                        KGNILND_SPIN_QUIESCE;
                }
                /* wait here for a subscribed event */
                rc = krca_wait_event(&rca_krt);

                /* RCA return values:
                 * 0 indicates krca_wakeup_wait_event caused krca_wait_event
                 *   return.
                 * -ERESTARTSYS indicates krca_wait_event returned because of a
                 *   signal.
                 * -ENOSPC indicates no space available to create an rcad_reg_t
                 * 1 indicates a message is waiting.
                 */
                if (rc <= 0) {
                        continue;
                }

                if (krca_get_message(&rca_krt, &event) == 0) {
                        int node_down = GNILND_RCA_NODE_UNKNOWN;
                        rs_state_t state;
                        LIST_HEAD(zombies);

                        /* Compute nodes don't care about other compute nodes
                         * so we don't need to create a peer.
                         */
                        if (GNILND_COMPUTE &&
                            !RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
                                        IS_SVC)) {
                                continue;
                        }

                        /* Only care about compute and service nodes not GPUs */
                        if (RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
                                        TYPE) != rt_node) {
                                continue;
                        }

                        switch (event.ev_id) {
                        case ec_node_available:
                                CDEBUG(D_INFO, "ec_node_available\n");
                                node_down = GNILND_RCA_NODE_UP;
                                break;
                        case ec_node_failed:
                                CDEBUG(D_INFO, "ec_node_failed\n");
                                if (event.ev_len > 0) {
                                        CDEBUG(D_ERROR,
                                                "ec_node_failed ignored\n");
                                        break;
                                }
                                node_down = GNILND_RCA_NODE_DOWN;
                                break;
                        case ec_node_unavailable:
                                state = RSN_GET_FLD(event.ev_gen.svid_node.rsn_intval, STATE);

                                CDEBUG(D_INFO, "ec_node_unavailable\n");

                                /*
                                 * Ignore overloaded ec_node_unavailable events
                                 * generated by 'xtcli set_reserve'.
                                 */
                                if (RS_GET_CS_STATE(state) == RS_CS_READY) {
                                        CDEBUG(D_INFO, "ignoring "
                                                "ec_node_unavailable event with"
                                                " RS_CS_READY state\n");
                                        break;
                                }
                                node_down = GNILND_RCA_NODE_DOWN;
                                break;
                        default:
                                CDEBUG(D_INFO, "unknown event\n");
                                break;
                        }

                        /* if we get an event we don't know about, just go ahead
                         * and wait for another event */
                        if (node_down == GNILND_RCA_NODE_UNKNOWN) {
                                continue;
                        }

                        nid = RSN_GET_FLD(event.ev_gen.svid_node.rs_node_flat,
                                          NID);
                        CDEBUG(D_INFO, "kgnilnd_rca() reporting nid %d %s\n",
                               (int)nid, node_down ? "down" : "up");
                        kgnilnd_report_node_state(nid, node_down);

                } else {
                        CNETERR("krca_get_message failed\n");
                }
        }

done:
        CDEBUG(D_INFO, "done\n");

        for (i = 0; i < RCA_EVENTS; i++) {
                if (rd[i].subscribed) {
                        rc = krca_unsubscribe(&rca_krt, rd[i].ticket);

                        if (rc) {
                                CNETERR("rca unsubscribe failed (%d)\n", rc);
                        }

                        rd[i].subscribed = 0;
                }
        }

        krca_unregister(&rca_krt);
        kgnilnd_thread_fini();
        return 0;

}

int
kgnilnd_start_rca_thread(void)
{
        return kgnilnd_thread_start(kgnilnd_rca, NULL, "kgnilnd_rca", 0);
}

void
kgnilnd_wakeup_rca_thread(void)
{
        int ret;

        ret = krca_wakeup_wait_event(&rca_krt);

        if (ret) {
                CDEBUG(D_ERROR, "krca_wakeup_wait_event failed\n");
        }
}

int
kgnilnd_get_node_state(__u32 nid)
{
        int i;
        int rc = GNILND_RCA_NODE_UNKNOWN;
        int ret;
        rs_node_array_t nlist;
        rs_node_t       *na = NULL;

        if ((ret = krca_get_sysnodes(&nlist)) < 0) {
                CDEBUG(D_NETERROR, "krca_get_sysnodes failed %d\n", ret);
                goto ns_done;
        }

        na = nlist.na_ids;

        for (i = 0; i < nlist.na_len; i++) {
                if ((rca_nid_t)RSN_GET_FLD(na[i].rs_node_flat, NID) == nid) {
                        rc = RSN_GET_FLD(na[i].rs_node_flat, STATE) == RS_CS_READY ?
                                GNILND_RCA_NODE_UP : GNILND_RCA_NODE_DOWN;
                        break;
                }
        }

ns_done:
        kfree(na);
        CDEBUG(D_NET, "nid %d rc %d (0=up)\n", nid, rc);
        return rc;
}

#else /* GNILND_USE_RCA */

int
kgnilnd_start_rca_thread(void)
{
        return 0;
}

void
kgnilnd_wakeup_rca_thread(void)
{
}

int
kgnilnd_get_node_state(__u32 nid)
{
        return GNILND_RCA_NODE_UP;
}
#endif /* GNILND_USE_RCA */